agentcassette 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentcassette/__init__.py +83 -0
- agentcassette/_cassette.py +169 -0
- agentcassette/_diff.py +70 -0
- agentcassette/_errors.py +35 -0
- agentcassette/_session.py +249 -0
- agentcassette/_tokens.py +86 -0
- agentcassette/py.typed +0 -0
- agentcassette-0.1.0.dist-info/METADATA +290 -0
- agentcassette-0.1.0.dist-info/RECORD +11 -0
- agentcassette-0.1.0.dist-info/WHEEL +4 -0
- agentcassette-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
agentcassette — Deterministic agent test recorder and replayer.
|
|
3
|
+
|
|
4
|
+
Zero dependencies. Pure Python stdlib.
|
|
5
|
+
|
|
6
|
+
Testing agents is painful: live LLM calls are expensive, non-deterministic, and
|
|
7
|
+
slow. agentcassette records a real run once, then replays it forever from a JSON
|
|
8
|
+
"cassette" — no network, no cost, fully deterministic.
|
|
9
|
+
|
|
10
|
+
Quick start
|
|
11
|
+
-----------
|
|
12
|
+
Wrap the callable(s) you want captured once (your model-call function, and
|
|
13
|
+
optionally your tools), then drive recording and replay with context managers::
|
|
14
|
+
|
|
15
|
+
import agentcassette
|
|
16
|
+
from agentcassette import record, replay
|
|
17
|
+
|
|
18
|
+
call_model = agentcassette.intercept(call_model, kind="llm")
|
|
19
|
+
|
|
20
|
+
# Record a real run:
|
|
21
|
+
with record("cassettes/flight_search.json"):
|
|
22
|
+
my_agent.run("Find flights to NYC under $300")
|
|
23
|
+
|
|
24
|
+
# Replay it in a test — no API calls happen:
|
|
25
|
+
def test_flight_search():
|
|
26
|
+
with replay("cassettes/flight_search.json"):
|
|
27
|
+
result = my_agent.run("Find flights to NYC under $300")
|
|
28
|
+
assert result.success
|
|
29
|
+
|
|
30
|
+
Catch regressions with strict replay::
|
|
31
|
+
|
|
32
|
+
from agentcassette import replay, DivergenceError
|
|
33
|
+
|
|
34
|
+
with replay("cassettes/flight_search.json", strict=True):
|
|
35
|
+
my_agent.run("Find flights to NYC under $300") # DivergenceError on drift
|
|
36
|
+
|
|
37
|
+
Inspect and diff cassettes::
|
|
38
|
+
|
|
39
|
+
from agentcassette import Cassette, diff_cassettes
|
|
40
|
+
|
|
41
|
+
c = Cassette.load("cassettes/flight_search.json")
|
|
42
|
+
c.num_steps, c.total_input_tokens, c.total_output_tokens
|
|
43
|
+
c.redact("api_key") # scrub secrets before committing
|
|
44
|
+
|
|
45
|
+
delta = diff_cassettes("cassettes/v1.json", "cassettes/v2.json")
|
|
46
|
+
delta.new_calls, delta.dropped_calls, delta.token_delta
|
|
47
|
+
|
|
48
|
+
See the project README for the full cassette format and API reference.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
from importlib.metadata import PackageNotFoundError, version as _version
|
|
54
|
+
|
|
55
|
+
from ._cassette import Cassette
|
|
56
|
+
from ._diff import CassetteDiff, diff_cassettes
|
|
57
|
+
from ._errors import (
|
|
58
|
+
AgentCassetteError,
|
|
59
|
+
CassetteNotFound,
|
|
60
|
+
DivergenceError,
|
|
61
|
+
ReplayExhausted,
|
|
62
|
+
)
|
|
63
|
+
from ._session import Player, Recorder, intercept, record, replay
|
|
64
|
+
|
|
65
|
+
__all__ = [
|
|
66
|
+
"record",
|
|
67
|
+
"replay",
|
|
68
|
+
"intercept",
|
|
69
|
+
"Cassette",
|
|
70
|
+
"Recorder",
|
|
71
|
+
"Player",
|
|
72
|
+
"diff_cassettes",
|
|
73
|
+
"CassetteDiff",
|
|
74
|
+
"AgentCassetteError",
|
|
75
|
+
"CassetteNotFound",
|
|
76
|
+
"ReplayExhausted",
|
|
77
|
+
"DivergenceError",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
# Default used when running from a source tree without install metadata;
# overwritten below when the installed distribution can be found.
__version__ = "0.0.0"
try:
    __version__ = _version("agentcassette")
except PackageNotFoundError:
    pass
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""The Cassette: agentcassette's on-disk recording format.
|
|
2
|
+
|
|
3
|
+
A cassette is a plain, human-readable JSON file — readable in a diff, safe to
|
|
4
|
+
commit to git, and portable across machines. Its shape:
|
|
5
|
+
|
|
6
|
+
{
|
|
7
|
+
"version": 1,
|
|
8
|
+
"recorded_at": "2026-06-30T12:00:00Z",
|
|
9
|
+
"model": "claude-sonnet-4-6", # optional label
|
|
10
|
+
"duration_ms": 1832.4, # wall time of the whole recorded run
|
|
11
|
+
"steps": [
|
|
12
|
+
{
|
|
13
|
+
"index": 0,
|
|
14
|
+
"type": "llm", # "llm" | "tool" | "call"
|
|
15
|
+
"name": "call_model",
|
|
16
|
+
"arguments": {"args": [...], "kwargs": {...}},
|
|
17
|
+
"result": {...},
|
|
18
|
+
"input_tokens": 420,
|
|
19
|
+
"output_tokens": 88,
|
|
20
|
+
"duration_ms": 512.0
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
Every intercepted call becomes one step, in the exact order it happened.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import json
|
|
31
|
+
import os
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
from ._errors import CassetteNotFound
|
|
35
|
+
|
|
36
|
+
CASSETTE_VERSION = 1
|
|
37
|
+
REDACTION_PLACEHOLDER = "****"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def to_jsonable(obj: Any) -> Any:
    """Coerce an arbitrary value into something ``json.dumps`` accepts.

    Conversion rules, applied in this order:

    * ``None``, ``bool``, ``int``, ``float`` and ``str`` pass through unchanged.
    * Dicts keep their values (converted recursively); keys become ``str``.
    * Lists and tuples become lists, preserving element order.
    * Sets and frozensets become lists sorted by the canonical JSON text of
      each converted element, so the output does not depend on hash order.
    * Objects exposing ``model_dump()``, ``dict()`` or ``to_dict()`` are
      converted from the return value of the first such callable found.
    * Objects with a non-empty ``__dict__`` become a dict of their attributes
      plus a ``"__type__"`` entry holding the class name.
    * Anything else becomes ``{"__repr__": repr(obj)}`` so the cassette stays
      valid JSON.

    Args:
        obj: Any Python value.

    Returns:
        A structure built only from dicts (with ``str`` keys), lists and
        JSON scalars.
    """
    if obj is None or isinstance(obj, (bool, int, float, str)):
        return obj
    if isinstance(obj, dict):
        return {str(k): to_jsonable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_jsonable(v) for v in obj]
    if isinstance(obj, (set, frozenset)):
        # Set iteration order follows element hashes, and ``str`` hashes are
        # randomized per process. Emitting that order would make the same call
        # serialize differently between the recording and the replaying run,
        # so order the converted elements by their canonical JSON text instead.
        return sorted(
            (to_jsonable(v) for v in obj),
            key=lambda item: json.dumps(item, sort_keys=True),
        )
    # Common SDK response objects expose their fields via model_dump()/dict().
    for method in ("model_dump", "dict", "to_dict"):
        fn = getattr(obj, method, None)
        if callable(fn):
            try:
                return to_jsonable(fn())
            except Exception:  # pragma: no cover - defensive
                # Best effort: fall through to the __dict__/repr fallbacks.
                break
    data = getattr(obj, "__dict__", None)
    if isinstance(data, dict) and data:
        return {"__type__": type(obj).__name__, **{str(k): to_jsonable(v) for k, v in data.items()}}
    return {"__repr__": repr(obj)}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _redact_in_place(obj: Any, key: str, replacement: str) -> None:
    """Overwrite every value stored under ``key`` inside ``obj``, recursively.

    Dicts and lists are walked and mutated in place; any other value is left
    alone. A value stored under ``key`` is replaced wholesale and is not
    descended into.
    """
    if isinstance(obj, list):
        for element in obj:
            _redact_in_place(element, key, replacement)
        return
    if not isinstance(obj, dict):
        return
    # Snapshot the keys so assigning into the dict cannot disturb iteration.
    for name in list(obj):
        if name == key:
            obj[name] = replacement
            continue
        _redact_in_place(obj[name], key, replacement)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class Cassette:
    """An ordered recording of intercepted calls.

    Usually produced by :func:`agentcassette.record` and consumed by
    :func:`agentcassette.replay`, but can also be loaded directly for inspection.

    Instances expose plain public attributes: ``version``, ``recorded_at``,
    ``model``, ``duration_ms`` and ``steps`` (a list of step dicts).
    """

    def __init__(
        self,
        steps: list[dict] | None = None,
        *,
        model: str | None = None,
        recorded_at: str | None = None,
        duration_ms: float = 0.0,
        version: int = CASSETTE_VERSION,
    ) -> None:
        self.version = version
        self.recorded_at = recorded_at
        self.model = model
        self.duration_ms = duration_ms
        # The caller's list is stored as-is (not copied), so later in-place
        # edits such as redact() are visible through the caller's reference.
        self.steps: list[dict] = steps if steps is not None else []

    # ---- persistence ----------------------------------------------------
    @classmethod
    def from_dict(cls, data: dict) -> "Cassette":
        """Build a cassette from a decoded JSON mapping.

        Missing keys fall back to the constructor defaults. A ``"steps"`` or
        ``"duration_ms"`` entry that is present but ``null`` is treated as
        missing rather than raising.
        """
        duration_ms = data.get("duration_ms")
        return cls(
            steps=list(data.get("steps") or []),
            model=data.get("model"),
            recorded_at=data.get("recorded_at"),
            duration_ms=0.0 if duration_ms is None else duration_ms,
            version=data.get("version", CASSETTE_VERSION),
        )

    def to_dict(self) -> dict:
        """Return the cassette as a mapping in on-disk key order.

        ``steps`` is the live list, not a copy.
        """
        return {
            "version": self.version,
            "recorded_at": self.recorded_at,
            "model": self.model,
            "duration_ms": self.duration_ms,
            "steps": self.steps,
        }

    @classmethod
    def load(cls, path: str | os.PathLike) -> "Cassette":
        """Load a cassette from disk. Raises :class:`CassetteNotFound` if absent."""
        # EAFP: open first and translate the failure, instead of checking
        # os.path.exists() and racing against the file being removed.
        try:
            fh = open(path, "r", encoding="utf-8")
        except FileNotFoundError:
            raise CassetteNotFound(f"No cassette at {os.fspath(path)!r}") from None
        with fh:
            return cls.from_dict(json.load(fh))

    def save(self, path: str | os.PathLike) -> None:
        """Write the cassette to disk as pretty-printed JSON, creating dirs."""
        # Serialize before opening the file: opening with "w" truncates, so a
        # serialization error must not be able to destroy an existing cassette.
        payload = json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(payload)
            fh.write("\n")

    # ---- inspection -----------------------------------------------------
    @property
    def num_steps(self) -> int:
        """Number of recorded steps."""
        return len(self.steps)

    @property
    def total_input_tokens(self) -> int:
        """Sum of ``input_tokens`` over all steps; missing or null counts as 0."""
        return sum(int(s.get("input_tokens") or 0) for s in self.steps)

    @property
    def total_output_tokens(self) -> int:
        """Sum of ``output_tokens`` over all steps; missing or null counts as 0."""
        return sum(int(s.get("output_tokens") or 0) for s in self.steps)

    @property
    def total_tokens(self) -> int:
        """Input plus output tokens over all steps."""
        return self.total_input_tokens + self.total_output_tokens

    def redact(self, key: str, replacement: str = REDACTION_PLACEHOLDER) -> "Cassette":
        """Replace every value stored under ``key`` (at any depth). Returns self."""
        for step in self.steps:
            _redact_in_place(step, key, replacement)
        return self

    def __len__(self) -> int:
        return len(self.steps)

    def __repr__(self) -> str:
        return (
            f"Cassette(steps={self.num_steps}, "
            f"input_tokens={self.total_input_tokens}, "
            f"output_tokens={self.total_output_tokens})"
        )
|
agentcassette/_diff.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Diff two cassettes to spot behavioral drift between agent versions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
from ._cassette import Cassette
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CassetteDiff:
    """The delta between two cassettes (``a`` = baseline, ``b`` = new).

    Attributes:
        new_calls: Call names that appear more often in ``b`` than ``a``,
            one entry per extra occurrence. A step without a ``"name"``
            contributes ``None``.
        dropped_calls: Call names that appear more often in ``a`` than ``b``.
        changed_calls: ``{"index", "a", "b"}`` entries for steps at the same
            position whose name/arguments/result differ. Only positions
            present in both cassettes are compared.
        token_delta: ``b`` total tokens minus ``a`` total tokens.
        input_token_delta / output_token_delta: the same, split by direction.
        step_delta: ``b`` step count minus ``a`` step count.
    """

    def __init__(self, a: Cassette, b: Cassette) -> None:
        names_a = Counter(s.get("name") for s in a.steps)
        names_b = Counter(s.get("name") for s in b.steps)

        # key=str leaves the ordering of real (str) names untouched, but keeps
        # sorted() from raising TypeError when a nameless step (None) is mixed
        # with named ones.
        self.new_calls: list[str] = sorted((names_b - names_a).elements(), key=str)
        self.dropped_calls: list[str] = sorted((names_a - names_b).elements(), key=str)

        # zip() stops at the shorter cassette; surplus steps are reported
        # through step_delta and new_calls/dropped_calls instead.
        self.changed_calls: list[dict] = [
            {"index": i, "a": sa, "b": sb}
            for i, (sa, sb) in enumerate(zip(a.steps, b.steps))
            if any(sa.get(field) != sb.get(field) for field in ("name", "arguments", "result"))
        ]

        self.input_token_delta = b.total_input_tokens - a.total_input_tokens
        self.output_token_delta = b.total_output_tokens - a.total_output_tokens
        self.token_delta = b.total_tokens - a.total_tokens
        self.step_delta = b.num_steps - a.num_steps

    @property
    def identical(self) -> bool:
        """True when the two cassettes have the same calls, args, and results."""
        return (
            not self.new_calls
            and not self.dropped_calls
            and not self.changed_calls
            and self.step_delta == 0
        )

    def __repr__(self) -> str:
        return (
            f"CassetteDiff(new={self.new_calls}, dropped={self.dropped_calls}, "
            f"changed={len(self.changed_calls)}, token_delta={self.token_delta})"
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def diff_cassettes(
    a: Union[str, os.PathLike, Cassette],
    b: Union[str, os.PathLike, Cassette],
) -> CassetteDiff:
    """Build a :class:`CassetteDiff` with ``a`` as baseline and ``b`` as the new run.

    Each argument may be an already-loaded :class:`Cassette` or a path, in
    which case it is read with :meth:`Cassette.load` (``a`` first, then ``b``).
    """
    baseline, candidate = (
        item if isinstance(item, Cassette) else Cassette.load(item)
        for item in (a, b)
    )
    return CassetteDiff(baseline, candidate)
|
agentcassette/_errors.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Exception types raised by agentcassette."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AgentCassetteError(Exception):
    """Common ancestor of agentcassette's own exceptions.

    Catch this to handle any error defined by the package in one place.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CassetteNotFound(AgentCassetteError):
    """Raised when a cassette is loaded from a path where no file exists."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ReplayExhausted(AgentCassetteError):
    """Raised during replay when an intercepted call arrives after every
    recorded step has already been served."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DivergenceError(AgentCassetteError):
    """Raised during a strict replay when a call does not match the recording.

    Attributes:
        step_index: Position of the diverging call in the cassette.
        expected: The recorded ``{"name", "arguments"}`` at that position.
        actual: The ``{"name", "arguments"}`` the agent produced instead.
    """

    def __init__(self, step_index: int, expected: dict, actual: dict) -> None:
        self.step_index = step_index
        self.expected = expected
        self.actual = actual
        super().__init__(
            f"Replay diverged at step {step_index}: "
            f"expected {expected.get('name')!r} with {expected.get('arguments')!r}, "
            f"got {actual.get('name')!r} with {actual.get('arguments')!r}"
        )

    def __reduce__(self) -> tuple:
        # The default exception reduce re-calls the class with ``self.args``,
        # which here is the single formatted message, so unpickling would fail
        # against the three-argument __init__. Rebuild from the original
        # constructor arguments so the error survives pickling.
        return (type(self), (self.step_index, self.expected, self.actual))
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Recording and replaying of intercepted calls.
|
|
2
|
+
|
|
3
|
+
The design is a deliberate, honest seam rather than monkey-patching: you wrap
|
|
4
|
+
the callables you want captured once with :func:`intercept`. A thread-local
|
|
5
|
+
"active session" then decides what happens each time a wrapped callable runs:
|
|
6
|
+
|
|
7
|
+
* No active session → the real function runs (production is unaffected).
|
|
8
|
+
* Inside ``record()`` → the real function runs and the call is taped.
|
|
9
|
+
* Inside ``replay()`` → the recorded result is returned; the real function is
|
|
10
|
+
never called (no network, no cost, fully deterministic).
|
|
11
|
+
|
|
12
|
+
This keeps agentcassette provider-agnostic and truly zero-dependency: it works
|
|
13
|
+
with OpenAI, Anthropic, a raw ``requests`` call, or a local model equally.
|
|
14
|
+
|
|
15
|
+
Both synchronous and ``async def`` callables are supported — :func:`intercept`
|
|
16
|
+
detects coroutine functions and returns an awaitable wrapper for them.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import functools
|
|
22
|
+
import inspect
|
|
23
|
+
import threading
|
|
24
|
+
import time
|
|
25
|
+
from contextlib import contextmanager
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from typing import Any, Callable, Iterator
|
|
28
|
+
|
|
29
|
+
from ._cassette import Cassette, to_jsonable
|
|
30
|
+
from ._errors import DivergenceError, ReplayExhausted
|
|
31
|
+
from ._tokens import count_tokens
|
|
32
|
+
|
|
33
|
+
_local = threading.local()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _current_session() -> "Recorder | Player | None":
    """Return the session stored on the thread-local, or ``None`` if unset."""
    try:
        return _local.session
    except AttributeError:
        return None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _set_session(session: "Recorder | Player | None") -> None:
    """Store ``session`` on the thread-local; pass ``None`` to clear it."""
    setattr(_local, "session", session)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    # isoformat() renders the UTC offset as "+00:00"; the cassette uses "Z".
    return stamp.replace("+00:00", "Z")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _normalize_args(args: tuple, kwargs: dict) -> dict:
    """Convert a call's arguments into the cassette's ``{"args", "kwargs"}`` form.

    Both parts go through :func:`to_jsonable`, so the result is what gets
    stored while recording and compared while replaying.
    """
    positional = to_jsonable(list(args))
    keyword = to_jsonable(dict(kwargs))
    return {"args": positional, "kwargs": keyword}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# The interception seam
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
def intercept(
    fn: Callable | None = None,
    *,
    name: str | None = None,
    kind: str = "call",
) -> Callable:
    """Wrap a callable so an active record/replay session can capture it.

    Supported spellings: ``intercept(fn)``, ``intercept(fn, name=..., kind="llm")``,
    bare ``@intercept`` and parameterized ``@intercept(kind="tool")``. Coroutine
    functions (``async def``) get an ``async`` wrapper; everything else gets a
    plain one.

    When no session is active the wrapper simply calls the original, so it is
    safe to leave in production code. Otherwise the call is delegated to the
    session's ``handle`` / ``handle_async``.

    Args:
        fn: The callable to wrap, or ``None`` when used as ``@intercept(...)``.
        name: Step name to record; defaults to the callable's ``__name__``
            (or ``"call"`` when it has none).
        kind: Step type label passed to the session (e.g. ``"llm"``, ``"tool"``).

    Returns:
        The wrapped callable, or a decorator when ``fn`` is ``None``. Wrappers
        carry ``__agentcassette_intercepted__ = True``.
    """

    def decorator(func: Callable) -> Callable:
        call_name = name or getattr(func, "__name__", "call")

        if not inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                session = _current_session()
                if session is not None:
                    return session.handle(call_name, kind, func, args, kwargs)
                return func(*args, **kwargs)

            wrapped: Callable = wrapper
        else:

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                session = _current_session()
                if session is not None:
                    return await session.handle_async(call_name, kind, func, args, kwargs)
                return await func(*args, **kwargs)

            wrapped = async_wrapper

        wrapped.__agentcassette_intercepted__ = True  # type: ignore[attr-defined]
        return wrapped

    return decorator if fn is None else decorator(fn)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Recorder
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
class Recorder:
    """Session that runs the real callables and tapes each call as a step.

    Attributes:
        model: Optional label supplied by the caller.
        steps: Step dicts, appended in call order.
        cassette: ``None`` at construction; assigned by ``record()`` after the
            cassette has been saved.
    """

    def __init__(self, *, model: str | None = None) -> None:
        self.model = model
        self.steps: list[dict] = []
        self.cassette: Cassette | None = None  # populated after the block exits

    def _append(
        self, name: str, kind: str, args: tuple, kwargs: dict, result: Any, duration_ms: float
    ) -> Any:
        """Tape one finished call and hand back the untouched ``result``."""
        arguments = _normalize_args(args, kwargs)
        jresult = to_jsonable(result)
        input_tokens, output_tokens = count_tokens(arguments, jresult)
        step = {
            "index": len(self.steps),
            "type": kind,
            "name": name,
            "arguments": arguments,
            "result": jresult,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "duration_ms": round(duration_ms, 3),
        }
        self.steps.append(step)
        # The live run keeps the real object; only the cassette gets the JSON form.
        return result

    def handle(self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict) -> Any:
        """Call ``func`` synchronously, time it, and record the call."""
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return self._append(name, kind, args, kwargs, outcome, elapsed_ms)

    async def handle_async(
        self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Await ``func``, time it, and record the call."""
        started = time.perf_counter()
        outcome = await func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return self._append(name, kind, args, kwargs, outcome, elapsed_ms)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Player
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
class Player:
    """Serves recorded results back in order, without calling the real function.

    Attributes:
        strict: When true, a mismatching call raises :class:`DivergenceError`.
        divergences: In non-strict mode, one ``{"index", "expected", "actual"}``
            entry per mismatching call.
    """

    def __init__(self, steps: list[dict], *, strict: bool = False) -> None:
        self._steps = steps
        self.strict = strict
        self._cursor = 0
        self.divergences: list[dict] = []

    @property
    def cursor(self) -> int:
        """Number of steps served so far (position of the next step)."""
        return self._cursor

    @property
    def remaining(self) -> int:
        """Number of recorded steps not yet served."""
        return len(self._steps) - self._cursor

    def _serve(self, name: str, kind: str, args: tuple, kwargs: dict) -> Any:
        """Return the next recorded result, checking the call against the tape.

        ``kind`` is accepted for signature parity with ``Recorder`` but is not
        part of the comparison; only name and normalized arguments are.

        Raises:
            ReplayExhausted: No recorded steps are left.
            DivergenceError: In strict mode, when name or arguments differ.
        """
        position = self._cursor
        if position >= len(self._steps):
            raise ReplayExhausted(
                f"cassette recorded {len(self._steps)} calls but the agent asked for more "
                f"(next was {name!r})"
            )
        step = self._steps[position]
        # The step is consumed even if it turns out to diverge below.
        self._cursor = position + 1

        actual = {"name": name, "arguments": _normalize_args(args, kwargs)}
        expected = {"name": step.get("name"), "arguments": step.get("arguments")}
        if expected != actual:
            # Use the same fallback in both modes: a step without a stored
            # "index" is reported by its position instead of as None.
            index = step.get("index", position)
            if self.strict:
                raise DivergenceError(index, expected, actual)
            self.divergences.append(
                {"index": index, "expected": expected, "actual": actual}
            )
        return step.get("result")

    def handle(self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict) -> Any:
        """Serve the next recorded result; ``func`` is never called."""
        return self._serve(name, kind, args, kwargs)

    async def handle_async(
        self, name: str, kind: str, func: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Async counterpart of :meth:`handle`; ``func`` is never awaited."""
        return self._serve(name, kind, args, kwargs)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
# Context managers
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
@contextmanager
def record(
    path,
    *,
    model: str | None = None,
    redact: "list[str] | None" = None,
) -> Iterator[Recorder]:
    """Tape every intercepted call made inside the block into a cassette file.

    The file is written only when the block finishes without raising; on an
    exception the session is cleared and nothing is saved. Keys listed in
    ``redact`` (e.g. ``["api_key"]``) are scrubbed before the file is written.

    Yields:
        The active :class:`Recorder`; its ``cassette`` attribute is set once
        the file has been saved.

    Raises:
        RuntimeError: A record/replay session is already active.
    """
    if _current_session() is not None:
        raise RuntimeError("agentcassette: a record/replay session is already active")

    recorder = Recorder(model=model)
    _set_session(recorder)
    started = time.perf_counter()
    try:
        yield recorder
    finally:
        # Runs on both paths; when the block raised, the exception keeps
        # propagating from here and the save below is never reached.
        _set_session(None)

    elapsed_ms = (time.perf_counter() - started) * 1000
    cassette = Cassette(
        steps=recorder.steps,
        model=model,
        recorded_at=_now_iso(),
        duration_ms=round(elapsed_ms, 3),
    )
    if redact:
        for secret_key in redact:
            cassette.redact(secret_key)
    cassette.save(path)
    recorder.cassette = cassette
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@contextmanager
def replay(path, *, strict: bool = False) -> Iterator[Player]:
    """Serve intercepted calls from the cassette at ``path`` instead of running them.

    ``strict=True`` turns any call whose name or arguments differ from the
    recording into a :class:`DivergenceError`. With the default
    ``strict=False`` such calls are appended to the yielded player's
    ``divergences`` list and the recorded result is still returned.

    Raises:
        RuntimeError: A record/replay session is already active.
    """
    if _current_session() is not None:
        raise RuntimeError("agentcassette: a record/replay session is already active")

    # Load before installing the session so a load failure leaves no session set.
    tape = Cassette.load(path)
    player = Player(tape.steps, strict=strict)
    _set_session(player)
    try:
        yield player
    finally:
        _set_session(None)
|
agentcassette/_tokens.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Token accounting for recorded steps.
|
|
2
|
+
|
|
3
|
+
agentcassette has zero dependencies, so it cannot call a real tokenizer
|
|
4
|
+
(``tiktoken`` and friends are third-party). Instead it:
|
|
5
|
+
|
|
6
|
+
1. Prefers exact counts when the recorded response carries a usage block
|
|
7
|
+
(OpenAI ``usage.prompt_tokens`` / Anthropic ``usage.input_tokens`` etc.).
|
|
8
|
+
2. Falls back to a character-length heuristic (~4 chars per token), which is
|
|
9
|
+
accurate enough for budgeting and regression comparisons.
|
|
10
|
+
|
|
11
|
+
The heuristic is intentionally simple and deterministic so cassettes recorded
|
|
12
|
+
on one machine reproduce identical counts on another.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
# Common usage-block key spellings across providers, mapped to (input, output).
|
|
21
|
+
_INPUT_KEYS = ("input_tokens", "prompt_tokens")
|
|
22
|
+
_OUTPUT_KEYS = ("output_tokens", "completion_tokens")
|
|
23
|
+
|
|
24
|
+
_CHARS_PER_TOKEN = 4


def estimate_tokens(obj: Any) -> int:
    """Estimate token count for an arbitrary JSON-able object via char heuristic.

    Strings are measured directly. Other values are measured by their JSON
    text (keys sorted, unknown types stringified), falling back to ``str(obj)``
    if JSON encoding fails.

    Args:
        obj: The payload to size.

    Returns:
        ``0`` for ``None`` or empty text, otherwise the character count divided
        by ``_CHARS_PER_TOKEN``, rounded up.
    """
    if obj is None:
        return 0
    if isinstance(obj, str):
        text = obj
    else:
        try:
            # ensure_ascii=False: count each non-ASCII character once, as the
            # bare-string branch does, not as a six-character \uXXXX escape.
            text = json.dumps(obj, sort_keys=True, default=str, ensure_ascii=False)
        except (TypeError, ValueError):
            text = str(obj)
    if not text:
        return 0
    # Round up so any non-empty payload counts as at least one token.
    return max(1, -(-len(text) // _CHARS_PER_TOKEN))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _find_usage(obj: Any) -> dict | None:
    """Depth-first search for the first dict stored under a ``"usage"`` key.

    A dict's own ``"usage"`` entry is checked before its values are searched.
    Lists and tuples are searched element by element; other values are
    ignored. Returns ``None`` when nothing matches.
    """
    if isinstance(obj, dict):
        direct = obj.get("usage")
        if isinstance(direct, dict):
            return direct
        children = obj.values()
    elif isinstance(obj, (list, tuple)):
        children = obj
    else:
        return None

    for child in children:
        hit = _find_usage(child)
        if hit is not None:
            return hit
    return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _pick(usage: dict, keys: tuple) -> int | None:
    """Return, as ``int``, the first numeric value found under any of ``keys``.

    Keys are tried in the given order; values that are not ``int``/``float``
    are skipped. Returns ``None`` when no key yields a number.
    """
    candidates = (usage.get(key) for key in keys)
    numeric = (int(value) for value in candidates if isinstance(value, (int, float)))
    return next(numeric, None)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def count_tokens(request: Any, response: Any) -> tuple[int, int]:
    """Return ``(input_tokens, output_tokens)`` for a recorded call.

    Each side is taken from a usage block found in ``response`` when that block
    supplies it; any side it does not supply is estimated with the character
    heuristic — the request for input, the response for output.
    """
    exact_in = exact_out = None
    usage = _find_usage(response)
    if usage is not None:
        exact_in = _pick(usage, _INPUT_KEYS)
        exact_out = _pick(usage, _OUTPUT_KEYS)

    input_tokens = estimate_tokens(request) if exact_in is None else exact_in
    output_tokens = estimate_tokens(response) if exact_out is None else exact_out
    return input_tokens, output_tokens
|
agentcassette/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentcassette
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic agent test recorder and replayer. Record live runs, replay as mocks. Zero dependencies.
|
|
5
|
+
Project-URL: Homepage, https://github.com/aenealabs/agentcassette
|
|
6
|
+
Project-URL: Repository, https://github.com/aenealabs/agentcassette
|
|
7
|
+
Project-URL: Issues, https://github.com/aenealabs/agentcassette/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/aenealabs/agentcassette/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: LaVon Rutledge <32437530+lavrut@users.noreply.github.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai-agents,cassette,deterministic,llm,mocking,record,replay,testing,vcr
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Classifier: Topic :: Software Development :: Testing
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# agentcassette
|
|
34
|
+
|
|
35
|
+
[PyPI version](https://pypi.org/project/agentcassette/)
|
|
36
|
+
[Python versions](https://pypi.org/project/agentcassette/)
|
|
37
|
+
[CI](https://github.com/aenealabs/agentcassette/actions/workflows/ci.yml)
|
|
38
|
+
[License: MIT](LICENSE)
|
|
39
|
+
[Zero dependencies](pyproject.toml)
|
|
40
|
+
|
|
41
|
+
**Deterministic agent test recorder and replayer.**
|
|
42
|
+
|
|
43
|
+
Record a real agent run once, replay it forever as a mock — no network, no cost, fully deterministic. Like VCR/pytest-recording, but purpose-built for LLM agents and with zero dependencies.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import agentcassette
|
|
47
|
+
from agentcassette import record, replay
|
|
48
|
+
|
|
49
|
+
call_model = agentcassette.intercept(call_model, kind="llm")
|
|
50
|
+
|
|
51
|
+
# Record a real run once:
|
|
52
|
+
with record("cassettes/flight_search.json"):
|
|
53
|
+
my_agent.run("Find flights to NYC under $300")
|
|
54
|
+
|
|
55
|
+
# Replay it in tests — no API calls, no tokens spent, same result every time:
|
|
56
|
+
def test_flight_search():
|
|
57
|
+
with replay("cassettes/flight_search.json"):
|
|
58
|
+
result = my_agent.run("Find flights to NYC under $300")
|
|
59
|
+
assert result.success
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Why agentcassette?
|
|
63
|
+
|
|
64
|
+
Testing agents is painful. Live LLM calls are **expensive** (every test run costs money), **non-deterministic** (a different answer each time), and **slow** (seconds per call). So most teams either skip agent testing or maintain a costly, flaky integration suite.
|
|
65
|
+
|
|
66
|
+
agentcassette records the real calls an agent makes into a plain-JSON **cassette**, then replays them on demand. Your tests become fast, free, and deterministic — and you can assert on exactly what the agent did.
|
|
67
|
+
|
|
68
|
+
Unlike VCR-style tools that monkey-patch the HTTP layer, agentcassette uses an explicit, honest seam: you wrap the callables you want captured. That keeps it **provider-agnostic** (OpenAI, Anthropic, Gemini, a raw `requests` call, or a local model all work identically) and **truly zero-dependency**.
|
|
69
|
+
|
|
70
|
+
## Installation
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install agentcassette
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Requires Python 3.9+. No other dependencies, ever.
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
### 1. Wrap what you want captured
|
|
81
|
+
|
|
82
|
+
Wrap your model-call function once (and any tools you want taped). Outside a record/replay block, wrapped callables behave exactly like the original — safe to leave in production code.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import agentcassette
|
|
86
|
+
|
|
87
|
+
# As a wrapper:
|
|
88
|
+
call_model = agentcassette.intercept(call_model, kind="llm")
|
|
89
|
+
|
|
90
|
+
# Or as a decorator:
|
|
91
|
+
@agentcassette.intercept(kind="tool")
|
|
92
|
+
def search_web(query: str) -> list[str]:
|
|
93
|
+
...
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### 2. Record a real run
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from agentcassette import record
|
|
100
|
+
|
|
101
|
+
with record("cassettes/flight_search.json", model="claude-sonnet-4-6"):
|
|
102
|
+
my_agent.run("Find flights to NYC under $300")
|
|
103
|
+
# Cassette is written on clean exit.
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3. Replay it in your tests
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from agentcassette import replay
|
|
110
|
+
|
|
111
|
+
def test_flight_search():
|
|
112
|
+
with replay("cassettes/flight_search.json"):
|
|
113
|
+
result = my_agent.run("Find flights to NYC under $300")
|
|
114
|
+
assert result.success
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
During replay, every intercepted call returns its recorded result and the real function is **never called**.
|
|
118
|
+
|
|
119
|
+
## Async agents
|
|
120
|
+
|
|
121
|
+
`intercept` detects `async def` callables and returns an awaitable wrapper, so async agents work the same way — including a mix of async and sync tools in one run:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import asyncio

import agentcassette
|
|
125
|
+
from agentcassette import record, replay
|
|
126
|
+
|
|
127
|
+
acall_model = agentcassette.intercept(acall_model, kind="llm") # an async def
|
|
128
|
+
|
|
129
|
+
async def agent(task):
|
|
130
|
+
plan = await acall_model(f"plan: {task}")
|
|
131
|
+
...
|
|
132
|
+
|
|
133
|
+
with record("cassettes/run.json"):
|
|
134
|
+
asyncio.run(agent("book a trip"))
|
|
135
|
+
|
|
136
|
+
with replay("cassettes/run.json"):
|
|
137
|
+
asyncio.run(agent("book a trip")) # awaited calls served from the cassette
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Catching regressions with strict replay
|
|
141
|
+
|
|
142
|
+
By default, replay serves recorded results best-effort and collects any divergences. With `strict=True`, a call whose name or arguments differ from the recording raises `DivergenceError` — turning your cassette into a behavioral contract.
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from agentcassette import replay, DivergenceError
|
|
146
|
+
|
|
147
|
+
with replay("cassettes/flight_search.json", strict=True):
|
|
148
|
+
my_agent.run("Find flights to NYC under $300") # raises on drift
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Best-effort mode exposes what changed without failing:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
with replay("cassettes/flight_search.json") as player:
|
|
155
|
+
my_agent.run("Find flights to NYC under $300")
|
|
156
|
+
|
|
157
|
+
for d in player.divergences:
|
|
158
|
+
print(d["index"], d["expected"], "->", d["actual"])
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Inspecting cassettes
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from agentcassette import Cassette
|
|
165
|
+
|
|
166
|
+
c = Cassette.load("cassettes/flight_search.json")
|
|
167
|
+
c.num_steps # number of intercepted calls
|
|
168
|
+
c.total_input_tokens # summed across steps
|
|
169
|
+
c.total_output_tokens
|
|
170
|
+
c.total_tokens
|
|
171
|
+
c.duration_ms # wall time of the original run
|
|
172
|
+
|
|
173
|
+
c.redact("api_key") # scrub secrets before committing to git
|
|
174
|
+
c.save("cassettes/flight_search.json")
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Token counts use exact usage blocks when the recorded response carries one (OpenAI `usage.prompt_tokens`, Anthropic `usage.input_tokens`, …), falling back to a deterministic ~4-chars-per-token heuristic otherwise.
|
|
178
|
+
|
|
179
|
+
## Redacting secrets
|
|
180
|
+
|
|
181
|
+
Scrub sensitive keys either when recording or after loading:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
# At record time:
|
|
185
|
+
with record("cassettes/run.json", redact=["api_key", "authorization"]):
|
|
186
|
+
my_agent.run(task)
|
|
187
|
+
|
|
188
|
+
# Or later:
|
|
189
|
+
Cassette.load("cassettes/run.json").redact("api_key").save("cassettes/run.json")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Diffing runs
|
|
193
|
+
|
|
194
|
+
Compare two cassettes to see how an agent's behavior drifted between versions:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from agentcassette import diff_cassettes
|
|
198
|
+
|
|
199
|
+
delta = diff_cassettes("cassettes/v1.json", "cassettes/v2.json")
|
|
200
|
+
delta.new_calls # call names in v2 but not v1
|
|
201
|
+
delta.dropped_calls # call names in v1 but not v2
|
|
202
|
+
delta.changed_calls # same-position steps whose args/results changed
|
|
203
|
+
delta.token_delta # total token change (v2 - v1)
|
|
204
|
+
delta.identical # True if nothing changed
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Cassette format
|
|
208
|
+
|
|
209
|
+
Cassettes are plain, human-readable JSON — diffable and safe to commit:
|
|
210
|
+
|
|
211
|
+
```json
|
|
212
|
+
{
|
|
213
|
+
"version": 1,
|
|
214
|
+
"recorded_at": "2026-06-30T12:00:00Z",
|
|
215
|
+
"model": "claude-sonnet-4-6",
|
|
216
|
+
"duration_ms": 1832.4,
|
|
217
|
+
"steps": [
|
|
218
|
+
{
|
|
219
|
+
"index": 0,
|
|
220
|
+
"type": "llm",
|
|
221
|
+
"name": "call_model",
|
|
222
|
+
"arguments": {"args": ["plan the task"], "kwargs": {}},
|
|
223
|
+
"result": {"text": "...", "usage": {"input_tokens": 420, "output_tokens": 88}},
|
|
224
|
+
"input_tokens": 420,
|
|
225
|
+
"output_tokens": 88,
|
|
226
|
+
"duration_ms": 512.0
|
|
227
|
+
}
|
|
228
|
+
]
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Every intercepted call becomes one step, in the exact order it happened.
|
|
233
|
+
|
|
234
|
+
## API Reference
|
|
235
|
+
|
|
236
|
+
### `intercept(fn=None, *, name=None, kind="call")`
|
|
237
|
+
|
|
238
|
+
Marks a callable as recordable/replayable. Usable as `intercept(fn)`, `intercept(fn, kind="llm")`, or as a decorator. Works on both regular functions and `async def` coroutine functions (async callables get an awaitable wrapper). `kind` is a free-form label stored on each step (e.g. `"llm"`, `"tool"`). Outside a session, the wrapped callable is a transparent pass-through.
|
|
239
|
+
|
|
240
|
+
### `record(path, *, model=None, redact=None)`
|
|
241
|
+
|
|
242
|
+
Context manager. Records every intercepted call made inside the block and writes the cassette to `path` on clean exit only. `redact` is a list of key names to scrub before saving. Yields the `Recorder`.
|
|
243
|
+
|
|
244
|
+
### `replay(path, *, strict=False)`
|
|
245
|
+
|
|
246
|
+
Context manager. Serves recorded results for intercepted calls without running the real functions. `strict=True` raises `DivergenceError` on any mismatch. Yields the `Player` (with `.divergences`, `.remaining`, `.cursor`).
|
|
247
|
+
|
|
248
|
+
### `Cassette`
|
|
249
|
+
|
|
250
|
+
| Member | Description |
|
|
251
|
+
|---|---|
|
|
252
|
+
| `Cassette.load(path)` | Load from disk (raises `CassetteNotFound`) |
|
|
253
|
+
| `.save(path)` | Write pretty-printed JSON, creating parent dirs |
|
|
254
|
+
| `.num_steps` | Number of recorded steps |
|
|
255
|
+
| `.total_input_tokens` / `.total_output_tokens` / `.total_tokens` | Token totals |
|
|
256
|
+
| `.duration_ms` | Wall time of the recorded run |
|
|
257
|
+
| `.redact(key, replacement="****")` | Scrub every value under `key`, at any depth |
|
|
258
|
+
|
|
259
|
+
### `diff_cassettes(a, b) -> CassetteDiff`
|
|
260
|
+
|
|
261
|
+
Compare two cassettes (paths or `Cassette` objects). Returns a `CassetteDiff` with `new_calls`, `dropped_calls`, `changed_calls`, `token_delta`, `input_token_delta`, `output_token_delta`, `step_delta`, and `identical`.
|
|
262
|
+
|
|
263
|
+
### Exceptions
|
|
264
|
+
|
|
265
|
+
All inherit from `AgentCassetteError`:
|
|
266
|
+
|
|
267
|
+
| Exception | Raised when |
|
|
268
|
+
|---|---|
|
|
269
|
+
| `CassetteNotFound` | Loading or replaying a cassette path that doesn't exist |
|
|
270
|
+
| `ReplayExhausted` | The agent makes more calls than the cassette recorded |
|
|
271
|
+
| `DivergenceError` | A strict replay sees a call that differs from the recording |
|
|
272
|
+
|
|
273
|
+
## Notes & limitations
|
|
274
|
+
|
|
275
|
+
- **Replayed results are JSON.** Recorded values round-trip through JSON, so on replay you get plain dicts/lists/primitives, not the original SDK objects. For typical LLM responses (dicts) this is exactly what you want.
|
|
276
|
+
- **Ordering matters.** Calls replay in the order they were recorded. agentcassette matches sequentially, which is deterministic and mirrors how an agent actually executes. Truly concurrent calls (e.g. `asyncio.gather`) are recorded in completion order; if that order isn't stable across runs, replay matching is best-effort — record such sections sequentially if you need strict determinism.
|
|
277
|
+
- **Sync and async.** Both `def` and `async def` callables are supported. `record`/`replay` sessions are thread-local: a session covers the thread that opened it, including the event loop running on that thread. If your agent fans out across OS threads, open a `record`/`replay` block inside each thread.
|
|
278
|
+
- **Streaming responses** (token iterators) are not specially handled yet — wrap at a boundary where the response is already materialized.
|
|
279
|
+
|
|
280
|
+
## Contributing
|
|
281
|
+
|
|
282
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
283
|
+
|
|
284
|
+
## License
|
|
285
|
+
|
|
286
|
+
MIT — see [LICENSE](LICENSE).
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
Part of the [aenealabs](https://github.com/aenealabs) AI agent toolkit.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
agentcassette/__init__.py,sha256=WdBqiRG9OfsoSWGGBzAYVg5ik9SxmhNRRc5ClpzpggU,2513
|
|
2
|
+
agentcassette/_cassette.py,sha256=SEk9cd5f5BR8YNJTkWU512yBIiAAOsd4G0iZBJOr5jU,5779
|
|
3
|
+
agentcassette/_diff.py,sha256=_mcod434ty-O3m-AG1GQEETVGx4pmXq7-ETo9mi3PIA,2707
|
|
4
|
+
agentcassette/_errors.py,sha256=tsI34kUVRvd8GDKc6-kdnXkal5jpUh0bD6fjsopTfD8,1217
|
|
5
|
+
agentcassette/_session.py,sha256=sr9kxFxVRO2SVs2rqiQ34wp_c1DGzTJk8kA55YpU0Kk,9017
|
|
6
|
+
agentcassette/_tokens.py,sha256=x3S2BEaa0YYkE6gouTmpPKEWuJdTAqrLEb0PCWLLceE,2985
|
|
7
|
+
agentcassette/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
agentcassette-0.1.0.dist-info/METADATA,sha256=RCFPSZDRK12pp_8hC2lVxY_BQa-hwRn-M3QHfQXjnAk,11418
|
|
9
|
+
agentcassette-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
10
|
+
agentcassette-0.1.0.dist-info/licenses/LICENSE,sha256=MYtxdpJgHsIGUqnc1UYrqDff94Vn7SDKXTxLhqOyCi8,1071
|
|
11
|
+
agentcassette-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 LaVon Rutledge
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|