cachelens 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cachelens-1.0.0/.gitignore +10 -0
- cachelens-1.0.0/CHANGELOG.md +20 -0
- cachelens-1.0.0/CLAUDE.md +55 -0
- cachelens-1.0.0/CONTRIBUTING.md +38 -0
- cachelens-1.0.0/LICENSE +21 -0
- cachelens-1.0.0/PKG-INFO +121 -0
- cachelens-1.0.0/README.md +78 -0
- cachelens-1.0.0/cache_lens/__init__.py +18 -0
- cachelens-1.0.0/cache_lens/analyzer.py +254 -0
- cachelens-1.0.0/cache_lens/cli.py +43 -0
- cachelens-1.0.0/cache_lens/models.py +75 -0
- cachelens-1.0.0/cache_lens/outputs/__init__.py +1 -0
- cachelens-1.0.0/cache_lens/outputs/json_export.py +48 -0
- cachelens-1.0.0/cache_lens/outputs/otel.py +68 -0
- cachelens-1.0.0/cache_lens/outputs/terminal.py +67 -0
- cachelens-1.0.0/cache_lens/pricing.py +183 -0
- cachelens-1.0.0/cache_lens/providers/__init__.py +58 -0
- cachelens-1.0.0/cache_lens/providers/anthropic.py +54 -0
- cachelens-1.0.0/cache_lens/providers/gemini.py +79 -0
- cachelens-1.0.0/cache_lens/providers/openai.py +62 -0
- cachelens-1.0.0/cache_lens/wrapper.py +195 -0
- cachelens-1.0.0/docs/architecture.md +52 -0
- cachelens-1.0.0/docs/positioning.md +109 -0
- cachelens-1.0.0/examples/anthropic_basic.py +25 -0
- cachelens-1.0.0/examples/openai_basic.py +22 -0
- cachelens-1.0.0/examples/queryargus_demo.py +49 -0
- cachelens-1.0.0/pyproject.toml +61 -0
- cachelens-1.0.0/tests/conftest.py +35 -0
- cachelens-1.0.0/tests/fixtures/anthropic_responses.json +20 -0
- cachelens-1.0.0/tests/fixtures/gemini_responses.json +18 -0
- cachelens-1.0.0/tests/fixtures/litellm_pricing.json +20 -0
- cachelens-1.0.0/tests/fixtures/openai_responses.json +22 -0
- cachelens-1.0.0/tests/providers/test_anthropic.py +24 -0
- cachelens-1.0.0/tests/providers/test_gemini.py +24 -0
- cachelens-1.0.0/tests/providers/test_openai.py +44 -0
- cachelens-1.0.0/tests/test_analyzer.py +117 -0
- cachelens-1.0.0/tests/test_capture.py +86 -0
- cachelens-1.0.0/tests/test_pricing.py +69 -0
- cachelens-1.0.0/tests/test_wrapper.py +97 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [1.0.0] - 2026-06-09
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- Wrapper interception for Anthropic, Gemini, and OpenAI clients (`wrap`, `CacheLens`, `CacheLensClient`)
|
|
14
|
+
- Request capture: normalises prompt to ordered `PromptSegment` list per call
|
|
15
|
+
- Content-based layer classification via longest-common-prefix analysis (system_prompt / context / conversation layers)
|
|
16
|
+
- Terminal report with cache hit rate, cost, savings, and per-layer breakdown
|
|
17
|
+
- JSON export (`json_export=` arg or `CACHE_LENS_JSON` env var)
|
|
18
|
+
- OpenTelemetry metrics output (`otel=True`)
|
|
19
|
+
- Overridable pricing table (native dict, JSON file, or `CACHE_LENS_PRICING` env var; LiteLLM format auto-detected)
|
|
20
|
+
- Gemini support for modern `google-genai` SDK (`config` kwarg pattern)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# CLAUDE.md — cache-lens
|
|
2
|
+
|
|
3
|
+
Context for Claude Code sessions on this repo.
|
|
4
|
+
|
|
5
|
+
## What this is
|
|
6
|
+
|
|
7
|
+
A Python library that instruments prompt caching in LLM API apps (Anthropic,
|
|
8
|
+
Gemini, OpenAI). You wrap a provider client; on each intercepted call it captures
|
|
9
|
+
both the **request prompt** (normalised to ordered `PromptSegment`s) and the
|
|
10
|
+
**response cache metrics**. At session end the analyzer does content-based layer
|
|
11
|
+
classification and produces a `SessionReport` rendered to terminal / JSON / OTEL.
|
|
12
|
+
Full design in [SPEC.md](SPEC.md).
|
|
13
|
+
|
|
14
|
+
The differentiator (see [docs/positioning.md](docs/positioning.md)): it doesn't
|
|
15
|
+
just report cached-token counts (LiteLLM/Langfuse/Helicone already do that) — it
|
|
16
|
+
diffs the prompt prefix across calls to name *which layer* is stable-but-uncached
|
|
17
|
+
and what restructuring would save.
|
|
18
|
+
|
|
19
|
+
## Layout
|
|
20
|
+
|
|
21
|
+
- `cache_lens/wrapper.py` — interception (`wrap`, `CacheLens`, `CacheLensClient`);
|
|
22
|
+
stores `List[CallCapture]` (request segments + response metrics) per session
|
|
23
|
+
- `cache_lens/providers/{anthropic,gemini,openai}.py` — `extract()` (response →
|
|
24
|
+
`RawCallMetrics`) and `capture()` (request → `List[PromptSegment]`)
|
|
25
|
+
- `cache_lens/analyzer.py` — longest-common-prefix layer classification, cost,
|
|
26
|
+
savings, ceiling, content-aware tips
|
|
27
|
+
- `cache_lens/models.py` — dataclasses (`RawCallMetrics`, `PromptSegment`,
|
|
28
|
+
`CallCapture`, `LayerReport`, `SessionReport`)
|
|
29
|
+
- `cache_lens/pricing.py` — price registry (USD per 1M tokens): bundled
|
|
30
|
+
`DEFAULT_PRICING` + runtime overrides via `CACHE_LENS_PRICING` env var or
|
|
31
|
+
`pricing=` arg (native or LiteLLM JSON, auto-detected, merged over defaults)
|
|
32
|
+
- `cache_lens/outputs/{terminal,json_export,otel}.py` — sinks
|
|
33
|
+
- `cache_lens/cli.py` — `cache-lens run` (scaffolded, not implemented)
|
|
34
|
+
- `tests/` — pytest; fixtures are JSON, loaded via `tests/conftest.py`
|
|
35
|
+
|
|
36
|
+
## Conventions
|
|
37
|
+
|
|
38
|
+
- Never let instrumentation break the wrapped caller — both `capture()` and
|
|
39
|
+
`extract()` are wrapped in try/except in `wrapper._wrap_call`; capture failure
|
|
40
|
+
yields empty segments (analyzer degrades to no layer diagnosis, aggregates still
|
|
41
|
+
exact).
|
|
42
|
+
- Provider SDKs and OTEL are optional deps; import them lazily inside functions,
|
|
43
|
+
not at module top level.
|
|
44
|
+
- Pricing/cost is per-token internally (`pricing.rate`), table is per-1M.
|
|
45
|
+
- Overall session aggregates (cost, savings, hit rate) are computed **exactly**
|
|
46
|
+
from response metrics; per-layer token splits are **estimated** by char-share
|
|
47
|
+
scaled to the real `input_tokens` (no tokenizer dep).
|
|
48
|
+
- Cost model: actual = miss·input + creation·cache_write + read·cache_read +
|
|
49
|
+
output·output; cold = input·input + output·output (every input token at the full input rate).
|
|
50
|
+
|
|
51
|
+
## Run tests
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e .[dev] && pytest
|
|
55
|
+
```
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Contributing to cachelens
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing!
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/ChingEnLin/CacheLens.git
|
|
9
|
+
cd CacheLens
|
|
10
|
+
pip install -e .[dev]
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Running tests
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pytest
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
All tests must pass before opening a PR.
|
|
20
|
+
|
|
21
|
+
## Adding a provider
|
|
22
|
+
|
|
23
|
+
1. Create `cache_lens/providers/<name>.py` with `extract(response) -> RawCallMetrics` and `capture(request, client) -> List[PromptSegment]`
|
|
24
|
+
2. Register it in `cache_lens/wrapper.py` (`_detect_provider`)
|
|
25
|
+
3. Add tests in `tests/providers/test_<name>.py`
|
|
26
|
+
|
|
27
|
+
## Pull requests
|
|
28
|
+
|
|
29
|
+
- Keep PRs focused — one feature or fix per PR
|
|
30
|
+
- Include tests for new behavior
|
|
31
|
+
- Update `CHANGELOG.md` under `[Unreleased]`
|
|
32
|
+
|
|
33
|
+
## Reporting issues
|
|
34
|
+
|
|
35
|
+
Open an issue at https://github.com/ChingEnLin/CacheLens/issues with:
|
|
36
|
+
- Python version and OS
|
|
37
|
+
- Provider SDK version
|
|
38
|
+
- Minimal reproducer
|
cachelens-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ching En Lin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cachelens-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cachelens
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Non-invasive prompt cache instrumentation for LLM API apps
|
|
5
|
+
Project-URL: Homepage, https://github.com/ChingEnLin/CacheLens
|
|
6
|
+
Project-URL: Repository, https://github.com/ChingEnLin/CacheLens
|
|
7
|
+
Project-URL: Issues, https://github.com/ChingEnLin/CacheLens/issues
|
|
8
|
+
Author-email: Ching En Lin <chingenlin71@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: anthropic,gemini,llm,observability,otel,prompt-caching
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Requires-Dist: rich>=13.0
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: anthropic>=0.40; extra == 'all'
|
|
27
|
+
Requires-Dist: google-generativeai>=0.8; extra == 'all'
|
|
28
|
+
Requires-Dist: openai>=1.40; extra == 'all'
|
|
29
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.20; extra == 'all'
|
|
30
|
+
Requires-Dist: opentelemetry-sdk>=1.20; extra == 'all'
|
|
31
|
+
Provides-Extra: anthropic
|
|
32
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
35
|
+
Provides-Extra: gemini
|
|
36
|
+
Requires-Dist: google-generativeai>=0.8; extra == 'gemini'
|
|
37
|
+
Provides-Extra: openai
|
|
38
|
+
Requires-Dist: openai>=1.40; extra == 'openai'
|
|
39
|
+
Provides-Extra: otel
|
|
40
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.20; extra == 'otel'
|
|
41
|
+
Requires-Dist: opentelemetry-sdk>=1.20; extra == 'otel'
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+
# cache-lens
|
|
45
|
+
|
|
46
|
+
> Non-invasive prompt cache instrumentation for LLM API apps.
|
|
47
|
+
> Wrap your client in one line. Get terminal reports, JSON exports, and OTEL metrics.
|
|
48
|
+
|
|
49
|
+
Prompt caching gives steep discounts on cached tokens — but nothing tells you
|
|
50
|
+
whether your app is actually getting cache hits, or why not. cache-lens wraps
|
|
51
|
+
your Anthropic, Gemini, or OpenAI client and reports cache hit rate, cost,
|
|
52
|
+
savings, and the money you're leaving on the table, broken down by prompt layer.
|
|
53
|
+
|
|
54
|
+
See [SPEC.md](SPEC.md) for the full design.
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install cachelens # core + rich
|
|
60
|
+
pip install cachelens[anthropic] # + Anthropic SDK
|
|
61
|
+
pip install cachelens[gemini] # + Gemini SDK
|
|
62
|
+
pip install cachelens[openai] # + OpenAI SDK
|
|
63
|
+
pip install cachelens[otel] # + OpenTelemetry
|
|
64
|
+
pip install cachelens[all] # everything
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Quickstart
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import anthropic
|
|
71
|
+
from cache_lens import wrap
|
|
72
|
+
|
|
73
|
+
client = wrap(anthropic.Anthropic())
|
|
74
|
+
# ... use client exactly as before; report prints on exit
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Explicit session boundary with exports:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from cache_lens import CacheLens
|
|
81
|
+
|
|
82
|
+
with CacheLens(client, json_export="report.json", otel=True) as session:
|
|
83
|
+
agent.run(...) # your code, unchanged
|
|
84
|
+
report = session.report
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Suppress the terminal report in CI with `CACHE_LENS_TERMINAL=0`.
|
|
88
|
+
|
|
89
|
+
## Custom pricing
|
|
90
|
+
|
|
91
|
+
cache-lens ships a default price table, but you can override or extend it without
|
|
92
|
+
forking — handy when a new model lands. User entries merge over the defaults:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# in-memory dict (native format, USD per 1M tokens)
|
|
96
|
+
wrap(client, pricing={"openai": {"gpt-5": {"input": 1.25, "output": 10.0, "cache_read": 0.125}}})
|
|
97
|
+
|
|
98
|
+
# or a JSON file (native or LiteLLM model_prices_and_context_window.json format)
|
|
99
|
+
wrap(client, pricing="pricing.json")
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Or point at a file process-wide with `CACHE_LENS_PRICING=/path/to/pricing.json`.
|
|
103
|
+
A bad pricing file falls back to defaults rather than breaking the run. See
|
|
104
|
+
[SPEC.md §12](SPEC.md#12-pricing-table).
|
|
105
|
+
|
|
106
|
+
## Status
|
|
107
|
+
|
|
108
|
+
v1.0. Implemented: wrapper interception with **request capture**, provider
|
|
109
|
+
extraction + capture (Anthropic + Gemini + OpenAI), **content-based layer
|
|
110
|
+
classification** (longest-common-prefix → named system_prompt / context /
|
|
111
|
+
conversation layers, cross-referenced against actual cache reads),
|
|
112
|
+
terminal/JSON/OTEL outputs, overridable pricing, tests.
|
|
113
|
+
Pending: `cache-lens run` CLI injection, streaming support, and cross-run
|
|
114
|
+
static/semi-static separation (see [docs/architecture.md](docs/architecture.md)).
|
|
115
|
+
|
|
116
|
+
## Develop
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pip install -e .[dev]
|
|
120
|
+
pytest
|
|
121
|
+
```
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# cache-lens
|
|
2
|
+
|
|
3
|
+
> Non-invasive prompt cache instrumentation for LLM API apps.
|
|
4
|
+
> Wrap your client in one line. Get terminal reports, JSON exports, and OTEL metrics.
|
|
5
|
+
|
|
6
|
+
Prompt caching gives steep discounts on cached tokens — but nothing tells you
|
|
7
|
+
whether your app is actually getting cache hits, or why not. cache-lens wraps
|
|
8
|
+
your Anthropic, Gemini, or OpenAI client and reports cache hit rate, cost,
|
|
9
|
+
savings, and the money you're leaving on the table, broken down by prompt layer.
|
|
10
|
+
|
|
11
|
+
See [SPEC.md](SPEC.md) for the full design.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install cachelens # core + rich
|
|
17
|
+
pip install cachelens[anthropic] # + Anthropic SDK
|
|
18
|
+
pip install cachelens[gemini] # + Gemini SDK
|
|
19
|
+
pip install cachelens[openai] # + OpenAI SDK
|
|
20
|
+
pip install cachelens[otel] # + OpenTelemetry
|
|
21
|
+
pip install cachelens[all] # everything
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quickstart
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import anthropic
|
|
28
|
+
from cache_lens import wrap
|
|
29
|
+
|
|
30
|
+
client = wrap(anthropic.Anthropic())
|
|
31
|
+
# ... use client exactly as before; report prints on exit
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Explicit session boundary with exports:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from cache_lens import CacheLens
|
|
38
|
+
|
|
39
|
+
with CacheLens(client, json_export="report.json", otel=True) as session:
|
|
40
|
+
agent.run(...) # your code, unchanged
|
|
41
|
+
report = session.report
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Suppress the terminal report in CI with `CACHE_LENS_TERMINAL=0`.
|
|
45
|
+
|
|
46
|
+
## Custom pricing
|
|
47
|
+
|
|
48
|
+
cache-lens ships a default price table, but you can override or extend it without
|
|
49
|
+
forking — handy when a new model lands. User entries merge over the defaults:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
# in-memory dict (native format, USD per 1M tokens)
|
|
53
|
+
wrap(client, pricing={"openai": {"gpt-5": {"input": 1.25, "output": 10.0, "cache_read": 0.125}}})
|
|
54
|
+
|
|
55
|
+
# or a JSON file (native or LiteLLM model_prices_and_context_window.json format)
|
|
56
|
+
wrap(client, pricing="pricing.json")
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Or point at a file process-wide with `CACHE_LENS_PRICING=/path/to/pricing.json`.
|
|
60
|
+
A bad pricing file falls back to defaults rather than breaking the run. See
|
|
61
|
+
[SPEC.md §12](SPEC.md#12-pricing-table).
|
|
62
|
+
|
|
63
|
+
## Status
|
|
64
|
+
|
|
65
|
+
v1.0. Implemented: wrapper interception with **request capture**, provider
|
|
66
|
+
extraction + capture (Anthropic + Gemini + OpenAI), **content-based layer
|
|
67
|
+
classification** (longest-common-prefix → named system_prompt / context /
|
|
68
|
+
conversation layers, cross-referenced against actual cache reads),
|
|
69
|
+
terminal/JSON/OTEL outputs, overridable pricing, tests.
|
|
70
|
+
Pending: `cache-lens run` CLI injection, streaming support, and cross-run
|
|
71
|
+
static/semi-static separation (see [docs/architecture.md](docs/architecture.md)).
|
|
72
|
+
|
|
73
|
+
## Develop
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e .[dev]
|
|
77
|
+
pytest
|
|
78
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""cache-lens — non-invasive prompt cache instrumentation for LLM API apps."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .models import LayerReport, RawCallMetrics, SessionReport
|
|
6
|
+
from .wrapper import CacheLens, CacheLensClient, wrap
|
|
7
|
+
|
|
8
|
+
# Version string exposed on the package (matches the 1.0.0 distribution metadata).
__version__ = "1.0.0"

# Names re-exported as the package's public API.
__all__ = [
    # client interception entry points (from .wrapper)
    "wrap",
    "CacheLens",
    "CacheLensClient",
    # report data models (from .models)
    "RawCallMetrics",
    "LayerReport",
    "SessionReport",
    # package metadata
    "__version__",
]
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Aggregate intercepted calls into a SessionReport.
|
|
2
|
+
|
|
3
|
+
Layer classification is content-based: the analyzer reconstructs each call's
|
|
4
|
+
prompt as an ordered list of segments, finds the longest prefix that is
|
|
5
|
+
byte-identical across every call in the session (the cacheable region), and names
|
|
6
|
+
the layers within it. It then cross-references that content-derived prefix
|
|
7
|
+
against the cache-read tokens the provider actually reported — surfacing which
|
|
8
|
+
named layer is stable-but-uncached and what it costs.
|
|
9
|
+
|
|
10
|
+
Token attribution per layer is estimated by character share, then scaled so each
|
|
11
|
+
call's layer tokens sum to the *real* input_tokens the provider returned. Overall
|
|
12
|
+
session aggregates (cost, savings, hit rate) are computed exactly from the
|
|
13
|
+
response metrics; only the per-layer split is an estimate.
|
|
14
|
+
|
|
15
|
+
Static vs semi-static is a single-run heuristic (a system-role prefix segment is
|
|
16
|
+
static; other stable-prefix content is semi-static). True static/semi-static
|
|
17
|
+
separation needs cross-run comparison, which a single in-memory session can't see.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import uuid
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from typing import Dict, List
|
|
25
|
+
|
|
26
|
+
from . import pricing
|
|
27
|
+
from .models import CallCapture, LayerReport, RawCallMetrics, SessionReport
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def analyze(captures: List[CallCapture], session_id: str = "") -> SessionReport:
    """Fold a session's captured calls into a single ``SessionReport``.

    Token totals, cost, savings and the overall hit rate are summed directly
    from each call's response metrics. The per-layer breakdown and the
    savings ceiling come from ``_classify_layers``. Provider and model are
    taken from the first captured call.

    Args:
        captures: One entry per intercepted call, in call order.
        session_id: Identifier for the report; a random UUID4 string is used
            when this is empty.

    Returns:
        The populated report, with ``tips`` attached. When ``captures`` is
        empty, a zero-call report whose start and end are the current UTC time.
    """
    sid = session_id or str(uuid.uuid4())
    stamp = datetime.now(timezone.utc)

    # Guard: nothing was intercepted, so there is nothing to aggregate.
    if not captures:
        return SessionReport(
            session_id=sid,
            provider="",
            model="",
            started_at=stamp,
            ended_at=stamp,
            total_calls=0,
            total_turns=0,
        )

    call_metrics = [capture.metrics for capture in captures]
    first_call, last_call = call_metrics[0], call_metrics[-1]
    provider, model = first_call.provider, first_call.model

    def total(field: str):
        # Sum one numeric metric field over every call in the session.
        return sum(getattr(m, field) for m in call_metrics)

    input_total = total("input_tokens")
    output_total = total("output_tokens")
    cached_total = total("cache_read_tokens")
    miss_total = total("cache_miss_tokens")

    spent = _actual_cost(call_metrics)
    uncached_price = _cold_cost(call_metrics)
    saved = max(uncached_price - spent, 0.0)
    hit_rate = (cached_total / input_total) if input_total else 0.0

    layers, prefix_len, per_call_prefix = _classify_layers(captures, provider, model)

    # Savings ceiling: the stable prefix could be read from cache on every
    # call after the first, paying the cache-read rate instead of the input rate.
    input_rate = pricing.rate(provider, model, "input")
    read_rate = pricing.rate(provider, model, "cache_read")
    repeat_calls = len(captures) - 1
    rate_gap = input_rate - read_rate
    ceiling = max(per_call_prefix * repeat_calls * rate_gap, 0.0)

    call_count = len(call_metrics)
    report = SessionReport(
        session_id=sid,
        provider=provider,
        model=model,
        started_at=first_call.timestamp,
        ended_at=last_call.timestamp,
        total_calls=call_count,
        total_turns=call_count,
        layers=layers,
        total_input_tokens=input_total,
        total_output_tokens=output_total,
        total_cached_tokens=cached_total,
        overall_hit_rate=hit_rate,
        actual_cost_usd=round(spent, 6),
        cold_cost_usd=round(uncached_price, 6),
        total_savings_usd=round(saved, 6),
        theoretical_max_savings_usd=round(ceiling, 6),
    )
    # Tips read the finished report, so they are attached after construction.
    report.tips = _build_tips(report, captures, layers, prefix_len, miss_total)
    return report
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _actual_cost(metrics: List[RawCallMetrics]) -> float:
    """Return the session cost as billed, summed over every call.

    Each call contributes four terms, priced via ``pricing.rate`` for that
    call's own provider and model: cache-miss tokens at the ``input`` rate,
    cache-creation tokens at ``cache_write``, cache-read tokens at
    ``cache_read`` and output tokens at ``output``.
    """
    running = 0.0
    for call in metrics:
        billed = (
            (call.cache_miss_tokens, "input"),
            (call.cache_creation_tokens, "cache_write"),
            (call.cache_read_tokens, "cache_read"),
            (call.output_tokens, "output"),
        )
        # Terms are added one at a time, in this order, to keep the float sum stable.
        for tokens, kind in billed:
            running += tokens * pricing.rate(call.provider, call.model, kind)
    return running
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _cold_cost(metrics: List[RawCallMetrics]) -> float:
    """Return what the session would cost with no cache discount at all.

    Every call's ``input_tokens`` is priced at the full ``input`` rate and its
    ``output_tokens`` at the ``output`` rate, using that call's own provider
    and model.
    """
    running = 0.0
    for call in metrics:
        for tokens, kind in ((call.input_tokens, "input"), (call.output_tokens, "output")):
            running += tokens * pricing.rate(call.provider, call.model, kind)
    return running
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _common_prefix_len(captures: List[CallCapture]) -> int:
    """Count the leading segments that are identical across all calls.

    Two segments are identical when both ``role`` and ``text`` compare equal.
    Comparison walks the calls position by position and stops at the first
    position where any call differs from the first call, or where the
    shortest call runs out of segments.

    Args:
        captures: Captured calls; each exposes an ordered ``segments`` list.

    Returns:
        The shared prefix length in segments. ``0`` when there are no
        captures or when any capture has no segments.
    """
    seq_lists = [c.segments for c in captures]
    if not seq_lists or not all(seq_lists):
        return 0

    shared = 0
    # zip(*...) yields one tuple per segment position and ends at the
    # shortest call, so no explicit length bookkeeping is needed.
    for column in zip(*seq_lists):
        head = column[0]
        if any(seg.role != head.role or seg.text != head.text for seg in column):
            break
        shared += 1
    return shared
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _classify_layers(captures: List[CallCapture], provider: str, model: str):
    """Return (layers, prefix_len, prefix_tokens_per_call).

    Each call's ``input_tokens`` is spread over its segments by character
    share. Segments inside the common prefix go to ``system_prompt`` (role
    ``"system"``) or ``context`` (any other role); everything after the prefix
    goes to ``conversation``. Calls with no captured text are attributed
    entirely to ``conversation``. Reported cache reads are credited to the
    prefix layers only, split by their token share; ``conversation`` is always
    reported with zero cached tokens. Layers with no tokens are omitted.
    """
    prefix_len = _common_prefix_len(captures)

    # Running token estimates per layer, accumulated in capture/segment order.
    bucket = {"system_prompt": 0.0, "context": 0.0, "conversation": 0.0}
    for capture in captures:
        call_tokens = capture.metrics.input_tokens
        char_total = sum(len(seg.text) for seg in capture.segments)
        if char_total <= 0:
            # No capturable content — attribute everything to the dynamic layer.
            bucket["conversation"] += call_tokens
            continue
        for position, seg in enumerate(capture.segments):
            estimate = call_tokens * (len(seg.text) / char_total)
            if position >= prefix_len:
                key = "conversation"
            elif seg.role == "system":
                key = "system_prompt"
            else:
                key = "context"
            bucket[key] += estimate

    sys_tok = bucket["system_prompt"]
    ctx_tok = bucket["context"]
    conv_tok = bucket["conversation"]

    # The stable prefix is sent on every call; cache reads (prefix-based) are
    # attributed to it, split across system_prompt and context by token share.
    prefix_tok = sys_tok + ctx_tok
    reported_reads = sum(capture.metrics.cache_read_tokens for capture in captures)
    reads_in_prefix = min(reported_reads, prefix_tok)
    if prefix_tok:
        sys_cached = reads_in_prefix * (sys_tok / prefix_tok)
    else:
        sys_cached = 0.0
    ctx_cached = reads_in_prefix - sys_cached

    input_rate = pricing.rate(provider, model, "input")
    read_rate = pricing.rate(provider, model, "cache_read")

    layer_specs = (
        ("system_prompt", "static", sys_tok, sys_cached),
        ("context", "semi_static", ctx_tok, ctx_cached),
        ("conversation", "dynamic", conv_tok, 0.0),
    )
    layers: List[LayerReport] = []
    for name, layer_type, total, cached in layer_specs:
        if not total > 0:
            continue
        cached = min(cached, total)  # a layer cannot have more cached than total tokens
        cold = total * input_rate
        actual = cached * read_rate + (total - cached) * input_rate
        layers.append(
            LayerReport(
                name=name,
                layer_type=layer_type,
                total_tokens=int(round(total)),
                cached_tokens=int(round(cached)),
                hit_rate=(cached / total) if total else 0.0,
                actual_cost_usd=round(actual, 6),
                cold_cost_usd=round(cold, 6),
                savings_usd=round(max(cold - actual, 0.0), 6),
            )
        )

    return layers, prefix_len, _prefix_tokens_per_call(captures, prefix_len)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _prefix_tokens_per_call(captures: List[CallCapture], prefix_len: int) -> float:
    """Estimate the token size of the stable prefix as sent on a single call.

    Only the first capture is consulted: its ``input_tokens`` is spread over
    its segments by character share, and the shares of the first
    ``prefix_len`` segments are summed.

    Args:
        captures: Captured calls, in call order.
        prefix_len: Number of leading segments that make up the stable prefix.

    Returns:
        The estimated prefix tokens. ``0.0`` when there are no captures, when
        ``prefix_len`` is not positive, or when the first capture has no text.
        A ``prefix_len`` larger than the first call's segment count is clamped
        to the segments available rather than raising ``IndexError``.
    """
    if not captures or prefix_len <= 0:
        return 0.0

    first = captures[0]
    total_chars = sum(len(seg.text) for seg in first.segments)
    if total_chars <= 0:
        return 0.0

    input_tokens = first.metrics.input_tokens  # loop-invariant, looked up once
    return sum(
        input_tokens * (len(seg.text) / total_chars)
        for seg in first.segments[:prefix_len]
    )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _build_tips(
    report: SessionReport,
    captures: List[CallCapture],
    layers: List[LayerReport],
    prefix_len: int,
    total_miss: int,
) -> List[str]:
    """Assemble the human-readable advice shown with a session report.

    Checks run in a fixed order and each appends at most one tip:
    Gemini with no cached tokens, an under-cached ``context`` layer, an
    under-cached ``system_prompt`` layer, no stable prefix at all, an
    uncached first call, and a high share of uncached input tokens.

    Args:
        report: The session report; only its scalar fields are read.
        captures: Captured calls, in call order.
        layers: Layer reports produced for the session.
        prefix_len: Length of the common segment prefix across calls.
        total_miss: Total cache-miss input tokens for the session.

    Returns:
        The tips, in the order the checks fired (possibly empty).
    """
    layer_by_name: Dict[str, LayerReport] = {}
    for layer in layers:
        layer_by_name[layer.name] = layer
    several_calls = report.total_calls > 1
    saw_segments = any(c.segments for c in captures)

    advice: List[str] = []

    # Gemini only reports cache reads when an explicit context cache is in use.
    if report.provider == "gemini" and report.total_cached_tokens == 0:
        advice.append(
            "No Gemini context cache detected — create a cacheContent object for "
            "stable context (system prompt, schema) to enable cache reads."
        )

    # Stable non-system prefix content that is mostly not being read from cache.
    context = layer_by_name.get("context")
    if context and several_calls and context.hit_rate < 0.5:
        advice.append(
            f"context layer (~{context.total_tokens:,} tokens) is identical across "
            f"all {report.total_calls} calls but only {context.hit_rate:.0%} cached — "
            f"move it behind a cache_control breakpoint before the conversation "
            f"history (est. ${report.theoretical_max_savings_usd:.3f} recoverable)."
        )

    # The system prompt has a stricter threshold than the context layer.
    system = layer_by_name.get("system_prompt")
    if system and several_calls and system.hit_rate < 0.9:
        advice.append(
            f"system_prompt cache hit rate is {system.hit_rate:.0%} — check the "
            "prefix isn't being prepended with dynamic content that breaks the cache."
        )

    if saw_segments and several_calls and prefix_len == 0:
        advice.append(
            "No stable prompt prefix detected across calls — content differs every "
            "turn, so prefix caching cannot help. Ensure your system prompt and "
            "static context are byte-identical on each call (and placed first)."
        )

    if captures and captures[0].metrics.cache_read_tokens == 0:
        advice.append(
            "First call always misses the cache (expected). Pre-warm with a dummy "
            "call before the loop starts to eliminate the cold miss."
        )

    # The zero check guards the division for sessions with no input tokens.
    if report.total_input_tokens:
        miss_share = total_miss / report.total_input_tokens
        if miss_share > 0.3:
            advice.append(
                f"{miss_share:.0%} of input tokens are "
                "uncached and re-sent each turn — consider summarising tool results "
                "instead of appending them verbatim."
            )

    return advice
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""cache-lens CLI: `cache-lens run <command>` for zero-code instrumentation (scaffolded; `run` is not yet implemented)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main(argv: list = None) -> int:
    """Entry point for the ``cache-lens`` command line.

    Args:
        argv: Argument list without the program name; ``sys.argv[1:]`` is
            used when this is ``None``.

    Returns:
        ``0`` after printing usage (no arguments, ``-h`` or ``--help``), the
        result of ``_run`` for the ``run`` command, and ``2`` for an unknown
        command.
    """
    args = sys.argv[1:] if argv is None else argv

    # No arguments is treated the same as an explicit help request.
    if not args or args[0] in ("-h", "--help"):
        _print_usage()
        return 0

    command = args[0]
    remainder = args[1:]
    if command == "run":
        return _run(remainder)

    sys.stderr.write(f"cache-lens: unknown command '{command}'\n")
    _print_usage()
    return 2
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _run(command: list) -> int:
    """Handle ``cache-lens run <command>``.

    Args:
        command: The command to instrument, as an argument list.

    Returns:
        ``2`` after writing an error to stderr when ``command`` is empty.

    Raises:
        NotImplementedError: For any non-empty command; the subprocess
            injection is scaffolded but not implemented.
    """
    if command:
        # Planned: sitecustomize injection that patches the SDK at import time
        # and registers an atexit report. Tracked in docs/architecture.md.
        raise NotImplementedError(
            "cache-lens run is scaffolded; sitecustomize injection not yet implemented"
        )

    sys.stderr.write("cache-lens run: no command given\n")
    return 2
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _print_usage() -> None:
    """Write the CLI usage text to stdout in a single ``write`` call."""
    usage_lines = (
        "cache-lens — prompt cache instrumentation\n\n",
        "Usage:\n",
        " cache-lens run <command> [args...] Instrument a subprocess\n",
    )
    sys.stdout.write("".join(usage_lines))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Script entry: run the CLI and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|