cassette-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ node_modules/
2
+ dist/
3
+ .wrangler/
4
+ *.log
5
+ .dev.vars
6
+ .env
7
+ __pycache__/
8
+ *.egg-info/
9
+ .venv/
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 2.4
2
+ Name: cassette-sdk
3
+ Version: 0.1.0
4
+ Summary: One-call shim to route LLM SDK traffic through the Cassette record/replay gateway
5
+ Project-URL: Homepage, https://github.com/NOVUS-STUDIOS-DEV/cassette
6
+ Project-URL: Source, https://github.com/NOVUS-STUDIOS-DEV/cassette
7
+ Author: Cassette
8
+ License: MIT
9
+ Keywords: agent,anthropic,ci,llm,openai,record,replay,testing,vcr
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Software Development :: Testing
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: httpx>=0.24
17
+ Description-Content-Type: text/markdown
18
+
19
+ # cassette-sdk
20
+
21
+ Record/replay LLM & agent API calls so your tests run **fast, free, and deterministic**.
22
+
23
+ The first time your tests run, Cassette records each LLM response to a local file. After that it
24
+ replays the saved response — no network, no API key, no token cost, and no random failures from the
25
+ model wording things differently.
26
+
27
+ ## Install
28
+
29
+ ```bash
30
+ pip install cassette-sdk
31
+ ```
32
+
33
+ ## Use (in-process, no gateway)
34
+
35
+ ```python
36
+ from cassette.recorder import http_client
37
+ from openai import OpenAI
38
+
39
+ client = OpenAI(http_client=http_client(project="demo")) # records → replays locally
40
+ ```
41
+
42
+ Modes via `CASSETTE_MODE`: `record` | `replay` | `auto` (default). Cassettes are plain JSON in
43
+ `./.cassettes` and diff cleanly in PRs.
44
+
45
+ ## Detect real regressions
46
+
47
+ ```python
48
+ from cassette.drift import compare_cassette_files
49
+ result = compare_cassette_files("baseline.json", "new.json")
50
+ print(result.verdict.value)  # identical | benign | regression
51
+ ```
52
+
53
+ It ignores harmless rewording but flags changed tool calls, structured-output shape changes, and
54
+ truncation as regressions.
55
+
56
+ Free and open source (MIT). Team features (shared registry + GitHub PR merge-gate) at
57
+ [cassette.dev](https://cassette.dev).
@@ -0,0 +1,39 @@
1
+ # cassette-sdk
2
+
3
+ Record/replay LLM & agent API calls so your tests run **fast, free, and deterministic**.
4
+
5
+ The first time your tests run, Cassette records each LLM response to a local file. After that it
6
+ replays the saved response — no network, no API key, no token cost, and no random failures from the
7
+ model wording things differently.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install cassette-sdk
13
+ ```
14
+
15
+ ## Use (in-process, no gateway)
16
+
17
+ ```python
18
+ from cassette.recorder import http_client
19
+ from openai import OpenAI
20
+
21
+ client = OpenAI(http_client=http_client(project="demo")) # records → replays locally
22
+ ```
23
+
24
+ Modes via `CASSETTE_MODE`: `record` | `replay` | `auto` (default). Cassettes are plain JSON in
25
+ `./.cassettes` and diff cleanly in PRs.
26
+
27
+ ## Detect real regressions
28
+
29
+ ```python
30
+ from cassette.drift import compare_cassette_files
31
+ result = compare_cassette_files("baseline.json", "new.json")
32
+ print(result.verdict.value)  # identical | benign | regression
33
+ ```
34
+
35
+ It ignores harmless rewording but flags changed tool calls, structured-output shape changes, and
36
+ truncation as regressions.
37
+
38
+ Free and open source (MIT). Team features (shared registry + GitHub PR merge-gate) at
39
+ [cassette.dev](https://cassette.dev).
@@ -0,0 +1,51 @@
1
+ """Cassette — optional one-call Python shim.
2
+
3
+ You don't strictly need this: setting OPENAI_BASE_URL / ANTHROPIC_BASE_URL to the gateway URL is
4
+ enough. This wrapper just composes those URLs from CASSETTE_* env vars and covers every provider at
5
+ once, so a single `cassette.use()` call wires the whole test process.
6
+
7
+ import cassette
8
+ cassette.use() # reads CASSETTE_GATEWAY / CASSETTE_PROJECT / CASSETTE_MODE
9
+ # ...all OpenAI/Anthropic/Google SDK calls now route through the gateway
10
+
11
+ Env vars:
12
+ CASSETTE_GATEWAY default http://localhost:8787
13
+ CASSETTE_PROJECT default "default"
14
+ CASSETTE_MODE record | replay | auto (default "auto")
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import os
19
+
20
+ __all__ = ["use", "base_url"]
21
+
22
# Path suffix appended to "<gateway>/<project>/<mode>" for each supported provider.
_PROVIDER_SUFFIX = {
    "openai": "/openai/v1",
    "anthropic": "/anthropic",
    "google": "/google",
}


def base_url(provider: str, *, gateway: str | None = None, project: str | None = None,
             mode: str | None = None) -> str:
    """Return the gateway base URL for a provider, e.g. for manual client construction.

    The URL has the form ``<gateway>/<project>/<mode><provider suffix>``.

    Args:
        provider: One of the keys of ``_PROVIDER_SUFFIX`` ("openai", "anthropic", "google").
        gateway: Gateway root URL; falls back to ``CASSETTE_GATEWAY``, then
            ``http://localhost:8787``. Trailing slashes are stripped.
        project: Project path segment; falls back to ``CASSETTE_PROJECT``, then ``"default"``.
        mode: Mode path segment; falls back to ``CASSETTE_MODE``, then ``"auto"``.

    Raises:
        ValueError: If ``provider`` is not a known provider.
    """
    # Validate first so a bad provider is reported regardless of the environment.
    if provider not in _PROVIDER_SUFFIX:
        raise ValueError(f"unknown provider {provider!r}; known: {list(_PROVIDER_SUFFIX)}")
    # Chain with `or` instead of a .get() default: an env var that is set but EMPTY
    # (e.g. an unset CI secret exported as "") must fall back to the default too,
    # otherwise the result is a broken URL such as "/default/auto/openai/v1".
    gateway = (gateway or os.environ.get("CASSETTE_GATEWAY") or "http://localhost:8787").rstrip("/")
    project = project or os.environ.get("CASSETTE_PROJECT") or "default"
    mode = mode or os.environ.get("CASSETTE_MODE") or "auto"
    return f"{gateway}/{project}/{mode}{_PROVIDER_SUFFIX[provider]}"
38
+
39
+
40
def use(*, gateway: str | None = None, project: str | None = None, mode: str | None = None) -> dict:
    """Point the standard SDK base-URL env vars at the Cassette gateway.

    One URL per provider is built with ``base_url`` from the given overrides, then all of them
    are written into ``os.environ``. The mapping that was written is returned for
    logging/inspection. Calling it again with the same arguments writes the same values.
    """
    targets = (
        ("OPENAI_BASE_URL", "openai"),
        ("ANTHROPIC_BASE_URL", "anthropic"),
        ("GOOGLE_GEMINI_BASE_URL", "google"),
    )
    # Build every URL before touching the environment, so a failure leaves it unchanged.
    env_map = {}
    for variable, provider in targets:
        env_map[variable] = base_url(provider, gateway=gateway, project=project, mode=mode)
    for variable, url in env_map.items():
        os.environ[variable] = url
    return env_map
@@ -0,0 +1,174 @@
1
+ """Drift detection — the "did it REALLY break?" engine. This is the moat.
2
+
3
+ A byte-diff is useless for LLM output: the model rewording an answer is NOT a regression, but a
4
+ changed tool call, a changed JSON shape, or a truncated response IS. This module classifies the
5
+ difference between two recorded responses into:
6
+
7
+ IDENTICAL — same bytes
8
+ BENIGN — only free-text wording changed; structure & behavior identical (non-determinism)
9
+ REGRESSION — behavior changed: tool calls, structured output shape, or stop reason differ
10
+
11
+ It is provider-aware (OpenAI + Anthropic chat shapes) and falls back to a generic JSON/text diff.
12
+ An optional `semantic_judge` hook can UPGRADE a benign text change to a regression when the meaning
13
+ diverges (embedding distance or an LLM judge) — that hook, trained on the cross-org corpus, is the
14
+ part competitors can't cheaply copy. Default behavior is conservative and deterministic.
15
+
16
+ Design rule (fail safe): when unsure whether a text-only change matters, default to BENIGN so the
17
+ gate doesn't cry wolf — but ALWAYS surface the diff so a human can bless or reject it. The gate
18
+ informs; it never silently decides.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from dataclasses import dataclass, field
24
+ from enum import Enum
25
+ from typing import Callable, Optional
26
+
27
+
28
class Verdict(str, Enum):
    """Classification of the difference between two recorded responses.

    The ``str`` mix-in makes each member compare equal to its plain string value.
    """

    IDENTICAL = "identical"    # the two bodies compared equal as-is
    BENIGN = "benign"          # wording-only drift; behavior unchanged
    REGRESSION = "regression"  # behavior changed — block the merge pending review
32
+
33
+
34
@dataclass
class DriftResult:
    """Result of a drift comparison: the verdict plus the signals that explain it."""

    verdict: Verdict
    # human-readable explanations for the verdict
    reasons: list[str] = field(default_factory=list)
    # structured behavioral signals, useful for the PR-check diff UI
    tool_calls_changed: bool = False
    structure_changed: bool = False
    stop_reason_changed: bool = False
    text_changed: bool = False

    @property
    def is_regression(self) -> bool:
        """Whether the verdict equals ``Verdict.REGRESSION``."""
        return self.verdict == Verdict.REGRESSION
47
+
48
+
49
+ # A semantic judge takes (old_text, new_text) and returns True if the MEANING changed materially.
50
+ SemanticJudge = Callable[[str, str], bool]
51
+
52
+
53
+ # --- provider-aware extraction of the BEHAVIORAL signal from a response body ---
54
+
55
@dataclass
class Behavior:
    """The behavior-relevant parts of one response body, as produced by ``extract_behavior``."""

    text: str                     # assistant text (or the raw body for unrecognized payloads)
    tool_calls: list              # normalized [{name, arguments}]
    stop_reason: Optional[str]    # OpenAI ``finish_reason`` / Anthropic ``stop_reason``
    structured: Optional[object]  # parsed JSON if the text itself is JSON (structured output)


def _normalize_tool_call(name: str, arguments) -> dict:
    """Return ``{"name": ..., "arguments": ...}`` with string arguments JSON-decoded.

    ``arguments`` may be a JSON string (OpenAI) or an already-parsed value (Anthropic ``input``).
    A string that is not valid JSON is kept verbatim.
    """
    if isinstance(arguments, str):
        try:
            arguments = json.loads(arguments)
        except (ValueError, TypeError):
            pass  # not JSON: compare the raw string instead
    return {"name": name, "arguments": arguments}


def extract_behavior(body: str, provider: str = "") -> Behavior:
    """Extract text, tool calls, stop reason and structured output from a response body.

    Recognized shapes are detected from the payload itself:

    * OpenAI chat completions — a JSON object with a ``choices`` key (first choice only);
    * Anthropic messages — a JSON object whose ``content`` is a list of blocks;
    * anything else that parses as JSON — returned with the raw body as ``text`` and the
      parsed value as ``structured``;
    * a body that is not JSON — returned as plain text.

    Malformed entries inside a recognized shape (a non-object choice, ``"message": null``,
    non-object tool calls or content blocks) are tolerated rather than raising, so one odd
    recording cannot crash the comparison.

    Args:
        body: Raw response body.
        provider: Accepted for API compatibility; detection does not currently use it.
    """
    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        return Behavior(text=body or "", tool_calls=[], stop_reason=None, structured=None)

    text, tool_calls, stop = "", [], None

    # OpenAI chat completions
    if isinstance(data, dict) and "choices" in data:
        choices = data.get("choices")
        choice = choices[0] if isinstance(choices, list) and choices else {}
        if not isinstance(choice, dict):
            choice = {}
        msg = choice.get("message")
        if not isinstance(msg, dict):  # covers a missing key and an explicit null
            msg = {}
        text = msg.get("content") or ""
        raw_calls = msg.get("tool_calls")
        for tc in raw_calls if isinstance(raw_calls, list) else []:
            fn = tc.get("function") if isinstance(tc, dict) else None
            if not isinstance(fn, dict):
                fn = {}  # keep a placeholder entry so the number of calls is still compared
            tool_calls.append(_normalize_tool_call(fn.get("name", ""), fn.get("arguments")))
        stop = choice.get("finish_reason")

    # Anthropic messages
    elif isinstance(data, dict) and isinstance(data.get("content"), list):
        parts = []
        for block in data["content"]:
            if not isinstance(block, dict):
                continue  # malformed block: nothing to extract
            kind = block.get("type")
            if kind == "text":
                piece = block.get("text", "")
                if isinstance(piece, str):  # a null/non-string text would break the join
                    parts.append(piece)
            elif kind == "tool_use":
                tool_calls.append(_normalize_tool_call(block.get("name", ""), block.get("input")))
        text = "".join(parts)
        stop = data.get("stop_reason")

    # generic fallback: treat the whole JSON as the structured payload
    else:
        return Behavior(text=body, tool_calls=[], stop_reason=None, structured=data)

    structured = None
    if text:
        try:
            structured = json.loads(text)  # the model was asked for JSON output
        except (ValueError, TypeError):
            pass  # ordinary prose, not structured output
    return Behavior(text=text, tool_calls=tool_calls, stop_reason=stop, structured=structured)
113
+
114
+
115
def _shape(value: object) -> object:
    """Reduce a JSON value to its type/key skeleton, discarding leaf values.

    Dicts keep their keys (in sorted order) with each value reduced recursively. A list is
    represented by the skeleton of its FIRST element only, or ``["<list>"]`` when empty.
    Any other value becomes the name of its Python type.
    """
    if isinstance(value, list):
        if value:
            return [_shape(value[0])]
        return ["<list>"]
    if isinstance(value, dict):
        skeleton = {}
        for key in sorted(value):
            skeleton[key] = _shape(value[key])
        return skeleton
    return type(value).__name__
122
+
123
+
124
def compare(old_body: str, new_body: str, *, provider: str = "",
            semantic_judge: Optional[SemanticJudge] = None) -> DriftResult:
    """Classify the drift between two recorded response bodies.

    Equal bodies are IDENTICAL. Otherwise the behavior of both bodies is extracted and four
    checks run in order — tool calls, structured-output shape, stop reason, free text. Any of
    the first three differing makes the result a REGRESSION. A text-only difference stays
    BENIGN unless ``semantic_judge`` is given and returns True for (old_text, new_text); the
    judge is not consulted once the verdict is already REGRESSION.
    """
    if old_body == new_body:
        return DriftResult(Verdict.IDENTICAL, ["byte-identical"])

    before = extract_behavior(old_body, provider)
    after = extract_behavior(new_body, provider)
    outcome = DriftResult(Verdict.BENIGN)

    def flag(reason: str) -> None:
        # escalate to REGRESSION and record why
        outcome.verdict = Verdict.REGRESSION
        outcome.reasons.append(reason)

    # 1) tool calls — the strongest behavioral signal
    if before.tool_calls != after.tool_calls:
        outcome.tool_calls_changed = True
        flag(f"tool calls changed: {before.tool_calls!r} -> {after.tool_calls!r}")

    # 2) structured-output shape — only relevant when at least one side parsed as JSON
    any_structured = not (before.structured is None and after.structured is None)
    if any_structured and _shape(before.structured) != _shape(after.structured):
        outcome.structure_changed = True
        flag("structured-output shape changed")

    # 3) stop / finish reason (e.g. 'stop' -> 'length' means truncation)
    if before.stop_reason != after.stop_reason:
        outcome.stop_reason_changed = True
        flag(f"stop reason changed: {before.stop_reason} -> {after.stop_reason}")

    # 4) free text — benign by default (non-determinism), unless a semantic judge disagrees
    if before.text != after.text:
        outcome.text_changed = True
        if outcome.verdict != Verdict.REGRESSION:
            if semantic_judge is not None and semantic_judge(before.text, after.text):
                flag("semantic judge: answer meaning changed materially")
            else:
                outcome.reasons.append("free-text wording changed (treated as benign non-determinism)")

    # bodies differ, yet none of the checks above produced a reason
    if not outcome.reasons:
        outcome.reasons.append("non-behavioral difference only")
    return outcome
165
+
166
+
167
def compare_cassette_files(path_a: str, path_b: str,
                           semantic_judge: Optional[SemanticJudge] = None) -> DriftResult:
    """Compare two cassette JSON files (the blessed baseline vs a PR's recording).

    Both files are parsed as JSON and their ``response.body`` strings are handed to
    ``compare``. The provider hint is taken from ``request.provider`` of the FIRST file
    when present, else "".

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If either file is not valid JSON.
        KeyError: If either cassette lacks ``response`` / ``body``.
    """
    # Decode explicitly as UTF-8: without it open() uses the locale's preferred encoding,
    # so the same cassette could load differently (or fail) across machines.
    with open(path_a, encoding="utf-8") as fa, open(path_b, encoding="utf-8") as fb:
        ca, cb = json.load(fa), json.load(fb)
    provider = ca.get("request", {}).get("provider", "")
    return compare(ca["response"]["body"], cb["response"]["body"],
                   provider=provider, semantic_judge=semantic_judge)
@@ -0,0 +1,115 @@
1
+ """In-process recorder — the recording surface Cassette OWNS (no gateway required).
2
+
3
+ This is the strategic core after the moat inversion: recording must NOT depend on routing traffic
4
+ through a gateway, because a gateway VCR is a commodity an incumbent can bundle and a customer can
5
+ self-host. Here we record at the HTTP-client layer inside the test process itself.
6
+
7
+ from cassette.recorder import http_client
8
+ from openai import OpenAI
9
+
10
+ client = OpenAI(http_client=http_client(project="demo")) # records/replays locally
11
+ # ...calls now hit local cassettes; no gateway, no extra infra in CI
12
+
13
+ Modes (env CASSETTE_MODE): record | replay | auto (default auto).
14
+ Local cassette dir (env CASSETTE_DIR): default ./.cassettes
15
+
16
+ The local cassette files conform to SPEC.md (the portable ".har of agent test traffic"). The hosted
17
+ backend (shared registry, RBAC, GitHub Checks merge-gate, semantic-drift matcher) is a CLIENT-SERVER
18
+ layer ON TOP of this format — never a precondition for recording. That boundary is the whole moat.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import json
24
+ import os
25
+ from pathlib import Path
26
+ from typing import Optional
27
+
28
+ try:
29
+ import httpx
30
+ except ImportError: # pragma: no cover
31
+ httpx = None # type: ignore
32
+
33
+ _VOLATILE = {"stream_options", "user", "metadata"}
34
+ _DROP_RESP_HEADERS = {"content-length", "content-encoding", "transfer-encoding", "connection"}
35
+
36
+
37
class CassetteMiss(RuntimeError):
    """Raised in replay mode when no cassette exists for a request (fails SAFE, never silently).

    Being a ``RuntimeError`` subclass, it is caught by handlers written for ``RuntimeError``.
    """
39
+
40
+
41
def _canonical(value: object) -> str:
    """Serialize a parsed JSON value to a compact, key-order-independent string.

    Dict keys are emitted in sorted order and any key listed in ``_VOLATILE`` is left out, at
    every nesting level. Lists keep their element order. Scalars (including None) go through
    ``json.dumps`` unchanged.
    """
    if isinstance(value, dict):
        kept = sorted(key for key in value if key not in _VOLATILE)
        members = [json.dumps(key) + ":" + _canonical(value[key]) for key in kept]
        return "{" + ",".join(members) + "}"
    if isinstance(value, list):
        return "[" + ",".join(map(_canonical, value)) + "]"
    return json.dumps(value)
48
+
49
+
50
def fingerprint(method: str, url: str, body: str) -> str:
    """Return the SHA-256 hex digest that identifies a request.

    The digest covers the upper-cased method, the URL and the body, joined by newlines. A
    body that parses as JSON is replaced by its ``_canonical`` form first; an empty or
    non-JSON body is hashed as given.
    """
    normalized = body
    if body:
        try:
            normalized = _canonical(json.loads(body))
        except (ValueError, TypeError):
            normalized = body  # not JSON: fall back to the raw body
    hasher = hashlib.sha256()
    hasher.update("\n".join((method.upper(), url, normalized)).encode())
    return hasher.hexdigest()
58
+
59
+
60
if httpx is not None:

    class CassetteTransport(httpx.BaseTransport):
        """An httpx transport that records to / replays from local cassette files.

        Each request is fingerprinted and mapped to ``<cassette_dir>/<fingerprint>.json``:

        * mode "replay" or "auto" with an existing cassette: the stored response is returned
          with an added ``x-cassette: replay`` header and the inner transport is not called;
        * mode "replay" without a cassette: ``CassetteMiss`` is raised;
        * otherwise the request is forwarded to the inner transport and the response is
          written to the cassette when its status is below 400.
        """

        def __init__(self, inner: httpx.BaseTransport, cassette_dir: Path, mode: str):
            """Wrap ``inner``; ``cassette_dir`` is created if it does not exist."""
            self._inner = inner
            self._dir = cassette_dir
            self._mode = mode
            self._dir.mkdir(parents=True, exist_ok=True)

        @staticmethod
        def _without_framing(headers) -> dict:
            """Copy ``headers`` without the entries in ``_DROP_RESP_HEADERS``.

            Those headers describe the wire framing of the ORIGINAL body (length, compression,
            chunking); they no longer hold for a body that is re-sent already decoded.
            """
            return {k: v for k, v in headers.items() if k.lower() not in _DROP_RESP_HEADERS}

        def handle_request(self, request: "httpx.Request") -> "httpx.Response":
            """Replay the request from disk or forward it and record the response."""
            body = request.content.decode("utf-8", "ignore")
            fp = fingerprint(request.method, str(request.url), body)
            path = self._dir / f"{fp}.json"

            if self._mode in ("replay", "auto") and path.exists():
                rec = json.loads(path.read_text(encoding="utf-8"))
                headers = self._without_framing(rec["response"]["headers"])
                headers["x-cassette"] = "replay"
                return httpx.Response(
                    rec["response"]["status"], headers=headers,
                    content=rec["response"]["body"].encode(), request=request,
                )
            if self._mode == "replay":
                raise CassetteMiss(f"no cassette for {request.method} {request.url} (fp={fp[:12]})")

            # record (or auto-miss → fails SAFE: just records)
            resp = self._inner.handle_request(request)
            content = resp.read()  # fully buffered and already content-decoded by httpx
            if resp.status_code < 400:
                # Headers are stored as received; they are filtered at replay time.
                path.write_text(json.dumps({
                    "v": 1,
                    "fingerprint": fp,
                    "request": {"method": request.method, "url": str(request.url), "body": body},
                    "response": {
                        "status": resp.status_code,
                        "headers": dict(resp.headers),
                        "body": content.decode("utf-8", "ignore"),
                    },
                }, indent=2), encoding="utf-8")
            # `content` is decoded, so the upstream content-encoding / content-length must not
            # be passed on: httpx would try to decompress the body a second time.
            return httpx.Response(resp.status_code, headers=self._without_framing(resp.headers),
                                  content=content, request=request)

        def close(self) -> None:
            """Close the wrapped transport so its connections are released with the client."""
            self._inner.close()
104
+
105
+
106
def http_client(*, project: Optional[str] = None, mode: Optional[str] = None,
                cassette_dir: Optional[str] = None) -> "httpx.Client":
    """Build an httpx.Client that records/replays locally — pass to OpenAI(http_client=...).

    Arguments left unset (or falsy) come from the environment: ``CASSETTE_PROJECT``
    (default "default"), ``CASSETTE_MODE`` (default "auto") and ``CASSETTE_DIR`` (default
    ".cassettes"). Cassettes live in ``<cassette_dir>/<project>``.

    Raises:
        RuntimeError: If httpx is not installed.
    """
    if httpx is None:
        raise RuntimeError("cassette.recorder requires httpx (pip install httpx)")
    if not project:
        project = os.environ.get("CASSETTE_PROJECT", "default")
    if not mode:
        mode = os.environ.get("CASSETTE_MODE", "auto")
    if not cassette_dir:
        cassette_dir = os.environ.get("CASSETTE_DIR", ".cassettes")
    project_dir = Path(cassette_dir) / project
    recorder = CassetteTransport(httpx.HTTPTransport(), project_dir, mode)
    return httpx.Client(transport=recorder)
@@ -0,0 +1,47 @@
1
+ """Registry client — sync local cassettes to/from the hosted team registry (the paid layer).
2
+
3
+ The OSS recorder works fully offline; this is the opt-in bridge that pushes a CI run's cassettes to
4
+ the shared registry so the GitHub merge-gate can compare them. Requires a seat token.
5
+
6
+ from cassette.registry import push_dir
7
+ push_dir(".cassettes/demo", project="acme/app", ref="pr-42", token="...")
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ try:
17
+ import httpx
18
+ except ImportError: # pragma: no cover
19
+ httpx = None # type: ignore
20
+
21
+
22
def _base() -> str:
    """Return the registry backend URL without trailing slashes.

    Read from ``CASSETTE_BACKEND`` on every call; defaults to ``https://api.cassette.dev``.
    """
    backend = os.environ.get("CASSETTE_BACKEND", "https://api.cassette.dev")
    return backend.rstrip("/")
24
+
25
+
26
def push_dir(cassette_dir: str, *, project: str, ref: str, token: Optional[str] = None) -> int:
    """Upload every cassette in a local dir to the registry under <project>/<ref>. Returns count.

    Every ``*.json`` file directly inside ``cassette_dir`` is read and sent in a single POST
    to ``<backend>/v1/<project>/cassettes?ref=<ref>`` with a bearer token. No request is made
    when the directory contains no cassettes.

    Args:
        cassette_dir: Directory holding the cassette files.
        project: Registry project path, interpolated into the URL as given.
        ref: Sent as the ``ref`` query parameter.
        token: Seat token; falls back to the ``CASSETTE_TOKEN`` env var.

    Returns:
        The ``pushed`` count reported by the registry (0 if the field is absent), or 0 when
        there was nothing to upload.

    Raises:
        RuntimeError: If httpx is not installed or no token is available.
        httpx.HTTPStatusError: If the registry answers with an error status.
    """
    if httpx is None:
        raise RuntimeError("cassette.registry requires httpx")
    token = token or os.environ.get("CASSETTE_TOKEN")
    if not token:
        raise RuntimeError("no seat token (set CASSETTE_TOKEN)")
    items = []
    # sorted(): glob order is filesystem-dependent; a stable order makes uploads reproducible.
    for f in sorted(Path(cassette_dir).glob("*.json")):
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        rec = json.loads(f.read_text(encoding="utf-8"))
        items.append({"fingerprint": rec.get("fingerprint", f.stem), "body": json.dumps(rec)})
    if not items:
        return 0
    resp = httpx.post(
        f"{_base()}/v1/{project}/cassettes",
        params={"ref": ref},
        headers={"authorization": f"Bearer {token}"},
        json=items,
        timeout=30,
    )
    resp.raise_for_status()
    return int(resp.json().get("pushed", 0))
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "cassette-sdk"
3
+ version = "0.1.0"
4
+ description = "One-call shim to route LLM SDK traffic through the Cassette record/replay gateway"
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "Cassette" }]
9
+ keywords = ["llm", "agent", "testing", "record", "replay", "vcr", "openai", "anthropic", "ci"]
10
+ dependencies = ["httpx>=0.24"]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Programming Language :: Python :: 3",
16
+ "Topic :: Software Development :: Testing",
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/NOVUS-STUDIOS-DEV/cassette"
21
+ Source = "https://github.com/NOVUS-STUDIOS-DEV/cassette"
22
+
23
+ [build-system]
24
+ requires = ["hatchling"]
25
+ build-backend = "hatchling.build"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["cassette"]