cachelens 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. cachelens-1.0.0/.gitignore +10 -0
  2. cachelens-1.0.0/CHANGELOG.md +20 -0
  3. cachelens-1.0.0/CLAUDE.md +55 -0
  4. cachelens-1.0.0/CONTRIBUTING.md +38 -0
  5. cachelens-1.0.0/LICENSE +21 -0
  6. cachelens-1.0.0/PKG-INFO +121 -0
  7. cachelens-1.0.0/README.md +78 -0
  8. cachelens-1.0.0/cache_lens/__init__.py +18 -0
  9. cachelens-1.0.0/cache_lens/analyzer.py +254 -0
  10. cachelens-1.0.0/cache_lens/cli.py +43 -0
  11. cachelens-1.0.0/cache_lens/models.py +75 -0
  12. cachelens-1.0.0/cache_lens/outputs/__init__.py +1 -0
  13. cachelens-1.0.0/cache_lens/outputs/json_export.py +48 -0
  14. cachelens-1.0.0/cache_lens/outputs/otel.py +68 -0
  15. cachelens-1.0.0/cache_lens/outputs/terminal.py +67 -0
  16. cachelens-1.0.0/cache_lens/pricing.py +183 -0
  17. cachelens-1.0.0/cache_lens/providers/__init__.py +58 -0
  18. cachelens-1.0.0/cache_lens/providers/anthropic.py +54 -0
  19. cachelens-1.0.0/cache_lens/providers/gemini.py +79 -0
  20. cachelens-1.0.0/cache_lens/providers/openai.py +62 -0
  21. cachelens-1.0.0/cache_lens/wrapper.py +195 -0
  22. cachelens-1.0.0/docs/architecture.md +52 -0
  23. cachelens-1.0.0/docs/positioning.md +109 -0
  24. cachelens-1.0.0/examples/anthropic_basic.py +25 -0
  25. cachelens-1.0.0/examples/openai_basic.py +22 -0
  26. cachelens-1.0.0/examples/queryargus_demo.py +49 -0
  27. cachelens-1.0.0/pyproject.toml +61 -0
  28. cachelens-1.0.0/tests/conftest.py +35 -0
  29. cachelens-1.0.0/tests/fixtures/anthropic_responses.json +20 -0
  30. cachelens-1.0.0/tests/fixtures/gemini_responses.json +18 -0
  31. cachelens-1.0.0/tests/fixtures/litellm_pricing.json +20 -0
  32. cachelens-1.0.0/tests/fixtures/openai_responses.json +22 -0
  33. cachelens-1.0.0/tests/providers/test_anthropic.py +24 -0
  34. cachelens-1.0.0/tests/providers/test_gemini.py +24 -0
  35. cachelens-1.0.0/tests/providers/test_openai.py +44 -0
  36. cachelens-1.0.0/tests/test_analyzer.py +117 -0
  37. cachelens-1.0.0/tests/test_capture.py +86 -0
  38. cachelens-1.0.0/tests/test_pricing.py +69 -0
  39. cachelens-1.0.0/tests/test_wrapper.py +97 -0
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ *.egg-info/
5
+ build/
6
+ dist/
7
+ .pytest_cache/
8
+ cache_reports/
9
+ SPEC.md
10
+ .claude/
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [1.0.0] - 2026-06-09
11
+
12
+ ### Added
13
+ - Wrapper interception for Anthropic, Gemini, and OpenAI clients (`wrap`, `CacheLens`, `CacheLensClient`)
14
+ - Request capture: normalises prompt to ordered `PromptSegment` list per call
15
+ - Content-based layer classification via longest-common-prefix analysis (system_prompt / context / conversation layers)
16
+ - Terminal report with cache hit rate, cost, savings, and per-layer breakdown
17
+ - JSON export (`json_export=` arg or `CACHE_LENS_JSON` env var)
18
+ - OpenTelemetry metrics output (`otel=True`)
19
+ - Overridable pricing table (native dict, JSON file, or `CACHE_LENS_PRICING` env var; LiteLLM format auto-detected)
20
+ - Gemini support for modern `google-genai` SDK (`config` kwarg pattern)
@@ -0,0 +1,55 @@
1
+ # CLAUDE.md — cache-lens
2
+
3
+ Context for Claude Code sessions on this repo.
4
+
5
+ ## What this is
6
+
7
+ A Python library that instruments prompt caching in LLM API apps (Anthropic,
8
+ Gemini, OpenAI). You wrap a provider client; on each intercepted call it captures
9
+ both the **request prompt** (normalised to ordered `PromptSegment`s) and the
10
+ **response cache metrics**. At session end the analyzer does content-based layer
11
+ classification and produces a `SessionReport` rendered to terminal / JSON / OTEL.
12
+ Full design in [SPEC.md](SPEC.md).
13
+
14
+ The differentiator (see [docs/positioning.md](docs/positioning.md)): it doesn't
15
+ just report cached-token counts (LiteLLM/Langfuse/Helicone already do that) — it
16
+ diffs the prompt prefix across calls to name *which layer* is stable-but-uncached
17
+ and what restructuring would save.
18
+
19
+ ## Layout
20
+
21
+ - `cache_lens/wrapper.py` — interception (`wrap`, `CacheLens`, `CacheLensClient`);
22
+ stores `List[CallCapture]` (request segments + response metrics) per session
23
+ - `cache_lens/providers/{anthropic,gemini,openai}.py` — `extract()` (response →
24
+ `RawCallMetrics`) and `capture()` (request → `List[PromptSegment]`)
25
+ - `cache_lens/analyzer.py` — longest-common-prefix layer classification, cost,
26
+ savings, ceiling, content-aware tips
27
+ - `cache_lens/models.py` — dataclasses (`RawCallMetrics`, `PromptSegment`,
28
+ `CallCapture`, `LayerReport`, `SessionReport`)
29
+ - `cache_lens/pricing.py` — price registry (USD per 1M tokens): bundled
30
+ `DEFAULT_PRICING` + runtime overrides via `CACHE_LENS_PRICING` env var or
31
+ `pricing=` arg (native or LiteLLM JSON, auto-detected, merged over defaults)
32
+ - `cache_lens/outputs/{terminal,json_export,otel}.py` — sinks
33
+ - `cache_lens/cli.py` — `cache-lens run` (scaffolded, not implemented)
34
+ - `tests/` — pytest; fixtures are JSON, loaded via `tests/conftest.py`
35
+
36
+ ## Conventions
37
+
38
+ - Never let instrumentation break the wrapped caller — both `capture()` and
39
+ `extract()` are wrapped in try/except in `wrapper._wrap_call`; capture failure
40
+ yields empty segments (analyzer degrades to no layer diagnosis, aggregates still
41
+ exact).
42
+ - Provider SDKs and OTEL are optional deps; import them lazily inside functions,
43
+ not at module top level.
44
+ - Pricing/cost is per-token internally (`pricing.rate`), table is per-1M.
45
+ - Overall session aggregates (cost, savings, hit rate) are computed **exactly**
46
+ from response metrics; per-layer token splits are **estimated** by char-share
47
+ scaled to the real `input_tokens` (no tokenizer dep).
48
+ - Cost model: actual = miss·input + creation·cache_write + read·cache_read +
49
+ output·output; cold = all input at full input rate.
50
+
51
+ ## Run tests
52
+
53
+ ```bash
54
+ pip install -e .[dev] && pytest
55
+ ```
@@ -0,0 +1,38 @@
1
+ # Contributing to cachelens
2
+
3
+ Thanks for your interest in contributing!
4
+
5
+ ## Setup
6
+
7
+ ```bash
8
+ git clone https://github.com/ChingEnLin/CacheLens.git
9
+ cd CacheLens
10
+ pip install -e .[dev]
11
+ ```
12
+
13
+ ## Running tests
14
+
15
+ ```bash
16
+ pytest
17
+ ```
18
+
19
+ All tests must pass before opening a PR.
20
+
21
+ ## Adding a provider
22
+
23
+ 1. Create `cache_lens/providers/<name>.py` with `extract(response) -> RawCallMetrics` and `capture(request, client) -> List[PromptSegment]`
24
+ 2. Register it in `cache_lens/wrapper.py` (`_detect_provider`)
25
+ 3. Add tests in `tests/providers/test_<name>.py`
26
+
27
+ ## Pull requests
28
+
29
+ - Keep PRs focused — one feature or fix per PR
30
+ - Include tests for new behavior
31
+ - Update `CHANGELOG.md` under `[Unreleased]`
32
+
33
+ ## Reporting issues
34
+
35
+ Open an issue at https://github.com/ChingEnLin/CacheLens/issues with:
36
+ - Python version and OS
37
+ - Provider SDK version
38
+ - Minimal reproducer
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ching En Lin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.1
2
+ Name: cachelens
3
+ Version: 1.0.0
4
+ Summary: Non-invasive prompt cache instrumentation for LLM API apps
5
+ Project-URL: Homepage, https://github.com/ChingEnLin/CacheLens
6
+ Project-URL: Repository, https://github.com/ChingEnLin/CacheLens
7
+ Project-URL: Issues, https://github.com/ChingEnLin/CacheLens/issues
8
+ Author-email: Ching En Lin <chingenlin71@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: anthropic,gemini,llm,observability,otel,prompt-caching
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.8
24
+ Requires-Dist: rich>=13.0
25
+ Provides-Extra: all
26
+ Requires-Dist: anthropic>=0.40; extra == 'all'
27
+ Requires-Dist: google-generativeai>=0.8; extra == 'all'
28
+ Requires-Dist: openai>=1.40; extra == 'all'
29
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.20; extra == 'all'
30
+ Requires-Dist: opentelemetry-sdk>=1.20; extra == 'all'
31
+ Provides-Extra: anthropic
32
+ Requires-Dist: anthropic>=0.40; extra == 'anthropic'
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.0; extra == 'dev'
35
+ Provides-Extra: gemini
36
+ Requires-Dist: google-generativeai>=0.8; extra == 'gemini'
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.40; extra == 'openai'
39
+ Provides-Extra: otel
40
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc>=1.20; extra == 'otel'
41
+ Requires-Dist: opentelemetry-sdk>=1.20; extra == 'otel'
42
+ Description-Content-Type: text/markdown
43
+
44
+ # cache-lens
45
+
46
+ > Non-invasive prompt cache instrumentation for LLM API apps.
47
+ > Wrap your client in one line. Get terminal reports, JSON exports, and OTEL metrics.
48
+
49
+ Prompt caching gives steep discounts on cached tokens — but nothing tells you
50
+ whether your app is actually getting cache hits, or why not. cache-lens wraps
51
+ your Anthropic, Gemini, or OpenAI client and reports cache hit rate, cost,
52
+ savings, and the money you're leaving on the table, broken down by prompt layer.
53
+
54
+ See [SPEC.md](SPEC.md) for the full design.
55
+
56
+ ## Install
57
+
58
+ ```bash
59
+ pip install cachelens # core + rich
60
+ pip install cachelens[anthropic] # + Anthropic SDK
61
+ pip install cachelens[gemini] # + Gemini SDK
62
+ pip install cachelens[openai] # + OpenAI SDK
63
+ pip install cachelens[otel] # + OpenTelemetry
64
+ pip install cachelens[all] # everything
65
+ ```
66
+
67
+ ## Quickstart
68
+
69
+ ```python
70
+ import anthropic
71
+ from cache_lens import wrap
72
+
73
+ client = wrap(anthropic.Anthropic())
74
+ # ... use client exactly as before; report prints on exit
75
+ ```
76
+
77
+ Explicit session boundary with exports:
78
+
79
+ ```python
80
+ from cache_lens import CacheLens
81
+
82
+ with CacheLens(client, json_export="report.json", otel=True) as session:
83
+ agent.run(...) # your code, unchanged
84
+ report = session.report
85
+ ```
86
+
87
+ Suppress the terminal report in CI with `CACHE_LENS_TERMINAL=0`.
88
+
89
+ ## Custom pricing
90
+
91
+ cache-lens ships a default price table, but you can override or extend it without
92
+ forking — handy when a new model lands. User entries merge over the defaults:
93
+
94
+ ```python
95
+ # in-memory dict (native format, USD per 1M tokens)
96
+ wrap(client, pricing={"openai": {"gpt-5": {"input": 1.25, "output": 10.0, "cache_read": 0.125}}})
97
+
98
+ # or a JSON file (native or LiteLLM model_prices_and_context_window.json format)
99
+ wrap(client, pricing="pricing.json")
100
+ ```
101
+
102
+ Or point at a file process-wide with `CACHE_LENS_PRICING=/path/to/pricing.json`.
103
+ A bad pricing file falls back to defaults rather than breaking the run. See
104
+ [SPEC.md §12](SPEC.md#12-pricing-table).
105
+
106
+ ## Status
107
+
108
+ v1.0. Implemented: wrapper interception with **request capture**, provider
109
+ extraction + capture (Anthropic + Gemini + OpenAI), **content-based layer
110
+ classification** (longest-common-prefix → named system_prompt / context /
111
+ conversation layers, cross-referenced against actual cache reads),
112
+ terminal/JSON/OTEL outputs, overridable pricing, tests.
113
+ Pending: `cache-lens run` CLI injection, streaming support, and cross-run
114
+ static/semi-static separation (see [docs/architecture.md](docs/architecture.md)).
115
+
116
+ ## Develop
117
+
118
+ ```bash
119
+ pip install -e .[dev]
120
+ pytest
121
+ ```
@@ -0,0 +1,78 @@
1
+ # cache-lens
2
+
3
+ > Non-invasive prompt cache instrumentation for LLM API apps.
4
+ > Wrap your client in one line. Get terminal reports, JSON exports, and OTEL metrics.
5
+
6
+ Prompt caching gives steep discounts on cached tokens — but nothing tells you
7
+ whether your app is actually getting cache hits, or why not. cache-lens wraps
8
+ your Anthropic, Gemini, or OpenAI client and reports cache hit rate, cost,
9
+ savings, and the money you're leaving on the table, broken down by prompt layer.
10
+
11
+ See [SPEC.md](SPEC.md) for the full design.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install cachelens # core + rich
17
+ pip install cachelens[anthropic] # + Anthropic SDK
18
+ pip install cachelens[gemini] # + Gemini SDK
19
+ pip install cachelens[openai] # + OpenAI SDK
20
+ pip install cachelens[otel] # + OpenTelemetry
21
+ pip install cachelens[all] # everything
22
+ ```
23
+
24
+ ## Quickstart
25
+
26
+ ```python
27
+ import anthropic
28
+ from cache_lens import wrap
29
+
30
+ client = wrap(anthropic.Anthropic())
31
+ # ... use client exactly as before; report prints on exit
32
+ ```
33
+
34
+ Explicit session boundary with exports:
35
+
36
+ ```python
37
+ from cache_lens import CacheLens
38
+
39
+ with CacheLens(client, json_export="report.json", otel=True) as session:
40
+ agent.run(...) # your code, unchanged
41
+ report = session.report
42
+ ```
43
+
44
+ Suppress the terminal report in CI with `CACHE_LENS_TERMINAL=0`.
45
+
46
+ ## Custom pricing
47
+
48
+ cache-lens ships a default price table, but you can override or extend it without
49
+ forking — handy when a new model lands. User entries merge over the defaults:
50
+
51
+ ```python
52
+ # in-memory dict (native format, USD per 1M tokens)
53
+ wrap(client, pricing={"openai": {"gpt-5": {"input": 1.25, "output": 10.0, "cache_read": 0.125}}})
54
+
55
+ # or a JSON file (native or LiteLLM model_prices_and_context_window.json format)
56
+ wrap(client, pricing="pricing.json")
57
+ ```
58
+
59
+ Or point at a file process-wide with `CACHE_LENS_PRICING=/path/to/pricing.json`.
60
+ A bad pricing file falls back to defaults rather than breaking the run. See
61
+ [SPEC.md §12](SPEC.md#12-pricing-table).
62
+
63
+ ## Status
64
+
65
+ v1.0. Implemented: wrapper interception with **request capture**, provider
66
+ extraction + capture (Anthropic + Gemini + OpenAI), **content-based layer
67
+ classification** (longest-common-prefix → named system_prompt / context /
68
+ conversation layers, cross-referenced against actual cache reads),
69
+ terminal/JSON/OTEL outputs, overridable pricing, tests.
70
+ Pending: `cache-lens run` CLI injection, streaming support, and cross-run
71
+ static/semi-static separation (see [docs/architecture.md](docs/architecture.md)).
72
+
73
+ ## Develop
74
+
75
+ ```bash
76
+ pip install -e .[dev]
77
+ pytest
78
+ ```
@@ -0,0 +1,18 @@
1
+ """cache-lens — non-invasive prompt cache instrumentation for LLM API apps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .models import LayerReport, RawCallMetrics, SessionReport
6
+ from .wrapper import CacheLens, CacheLensClient, wrap
7
+
8
+ __version__ = "1.0.0"
9
+
10
+ __all__ = [
11
+ "wrap",
12
+ "CacheLens",
13
+ "CacheLensClient",
14
+ "RawCallMetrics",
15
+ "LayerReport",
16
+ "SessionReport",
17
+ "__version__",
18
+ ]
@@ -0,0 +1,254 @@
1
+ """Aggregate intercepted calls into a SessionReport.
2
+
3
+ Layer classification is content-based: the analyzer reconstructs each call's
4
+ prompt as an ordered list of segments, finds the longest prefix that is
5
+ byte-identical across every call in the session (the cacheable region), and names
6
+ the layers within it. It then cross-references that content-derived prefix
7
+ against the cache-read tokens the provider actually reported — surfacing which
8
+ named layer is stable-but-uncached and what it costs.
9
+
10
+ Token attribution per layer is estimated by character share, then scaled so each
11
+ call's layer tokens sum to the *real* input_tokens the provider returned. Overall
12
+ session aggregates (cost, savings, hit rate) are computed exactly from the
13
+ response metrics; only the per-layer split is an estimate.
14
+
15
+ Static vs semi-static is a single-run heuristic (a system-role prefix segment is
16
+ static; other stable-prefix content is semi-static). True static/semi-static
17
+ separation needs cross-run comparison, which a single in-memory session can't see.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import uuid
23
+ from datetime import datetime, timezone
24
+ from typing import Dict, List
25
+
26
+ from . import pricing
27
+ from .models import CallCapture, LayerReport, RawCallMetrics, SessionReport
28
+
29
+
30
def analyze(captures: List[CallCapture], session_id: str = "") -> SessionReport:
    """Fold a session's captured calls into one ``SessionReport``.

    Token totals, actual/cold cost, savings and the overall hit rate are
    computed from each capture's response metrics. The per-layer breakdown
    comes from ``_classify_layers`` and the advice strings from
    ``_build_tips``. Provider and model are taken from the first call.

    Args:
        captures: Intercepted calls, in the order they were recorded.
        session_id: Identifier for the report; a random UUID4 string is
            generated when this is empty.

    Returns:
        The populated report. With no captures, an empty report whose start
        and end timestamps are both the current UTC time.
    """
    if not session_id:
        session_id = str(uuid.uuid4())
    now = datetime.now(timezone.utc)

    if not captures:
        # Nothing was intercepted: return a zeroed report stamped "now".
        return SessionReport(
            session_id=session_id,
            provider="",
            model="",
            started_at=now,
            ended_at=now,
            total_calls=0,
            total_turns=0,
        )

    metrics = [capture.metrics for capture in captures]
    first_call, last_call = metrics[0], metrics[-1]
    provider, model = first_call.provider, first_call.model

    total_input = sum(call.input_tokens for call in metrics)
    total_output = sum(call.output_tokens for call in metrics)
    total_cached = sum(call.cache_read_tokens for call in metrics)
    total_miss = sum(call.cache_miss_tokens for call in metrics)

    actual_cost = _actual_cost(metrics)
    cold_cost = _cold_cost(metrics)
    savings = max(cold_cost - actual_cost, 0.0)
    if total_input:
        overall_hit_rate = total_cached / total_input
    else:
        overall_hit_rate = 0.0

    layers, prefix_len, prefix_per_call_tokens = _classify_layers(
        captures, provider, model
    )

    # Ceiling on savings: every call after the first reads the whole stable
    # prefix at the cache-read rate instead of the full input rate.
    input_rate = pricing.rate(provider, model, "input")
    read_rate = pricing.rate(provider, model, "cache_read")
    repeat_calls = len(captures) - 1
    per_token_discount = input_rate - read_rate
    theoretical_max = max(
        prefix_per_call_tokens * repeat_calls * per_token_discount, 0.0
    )

    call_count = len(metrics)
    report = SessionReport(
        session_id=session_id,
        provider=provider,
        model=model,
        started_at=first_call.timestamp,
        ended_at=last_call.timestamp,
        total_calls=call_count,
        total_turns=call_count,
        layers=layers,
        total_input_tokens=total_input,
        total_output_tokens=total_output,
        total_cached_tokens=total_cached,
        overall_hit_rate=overall_hit_rate,
        actual_cost_usd=round(actual_cost, 6),
        cold_cost_usd=round(cold_cost, 6),
        total_savings_usd=round(savings, 6),
        theoretical_max_savings_usd=round(theoretical_max, 6),
    )
    report.tips = _build_tips(report, captures, layers, prefix_len, total_miss)
    return report
87
+
88
+
89
def _actual_cost(metrics: List[RawCallMetrics]) -> float:
    """Sum what the calls cost as billed, given their cache behaviour.

    For each call: cache-miss tokens are charged at the ``input`` rate,
    cache-creation tokens at ``cache_write``, cache-read tokens at
    ``cache_read`` and output tokens at ``output``.
    """
    total = 0.0
    for call in metrics:
        charges = (
            (call.cache_miss_tokens, "input"),
            (call.cache_creation_tokens, "cache_write"),
            (call.cache_read_tokens, "cache_read"),
            (call.output_tokens, "output"),
        )
        # Add one charge at a time so the float accumulation order is stable.
        for tokens, kind in charges:
            total += tokens * pricing.rate(call.provider, call.model, kind)
    return total
97
+
98
+
99
def _cold_cost(metrics: List[RawCallMetrics]) -> float:
    """Sum the cost the calls would incur if no input token were cached.

    Every input token is charged at the full ``input`` rate; output tokens
    are charged at the ``output`` rate.
    """
    total = 0.0
    for call in metrics:
        charges = (
            (call.input_tokens, "input"),
            (call.output_tokens, "output"),
        )
        for tokens, kind in charges:
            total += tokens * pricing.rate(call.provider, call.model, kind)
    return total
106
+
107
+
108
def _common_prefix_len(captures: List[CallCapture]) -> int:
    """Count the leading segments that match (role and text) in every call.

    Returns 0 when there are no captures or when any capture has no
    segments.
    """
    per_call = [capture.segments for capture in captures]
    if not per_call or not all(per_call):
        return 0

    shared = 0
    # zip() stops at the shortest call, so each column holds the segment at
    # one position from every call.
    for column in zip(*per_call):
        reference = column[0]
        matches = all(
            segment.role == reference.role and segment.text == reference.text
            for segment in column
        )
        if not matches:
            break
        shared += 1
    return shared
124
+
125
+
126
def _classify_layers(captures: List[CallCapture], provider: str, model: str):
    """Return (layers, prefix_len, prefix_tokens_per_call).

    Each call's ``input_tokens`` is split across its segments by character
    share. Segments inside the common prefix count towards ``system_prompt``
    (role ``"system"``) or ``context`` (any other role); everything after the
    prefix counts towards ``conversation``. A call with no captured text is
    attributed entirely to ``conversation``. Layers with no tokens are
    omitted from the returned list.
    """
    prefix_len = _common_prefix_len(captures)

    # Estimated token totals per layer, accumulated over every call.
    estimate = {"system_prompt": 0.0, "context": 0.0, "conversation": 0.0}
    for capture in captures:
        segments = capture.segments
        char_total = sum(len(segment.text) for segment in segments)
        if char_total <= 0:
            # No capturable content — attribute everything to the dynamic layer.
            estimate["conversation"] += capture.metrics.input_tokens
            continue
        for position, segment in enumerate(segments):
            share = capture.metrics.input_tokens * (len(segment.text) / char_total)
            if position >= prefix_len:
                bucket = "conversation"
            elif segment.role == "system":
                bucket = "system_prompt"
            else:
                bucket = "context"
            estimate[bucket] += share

    system_total = estimate["system_prompt"]
    context_total = estimate["context"]
    conversation_total = estimate["conversation"]

    # Cache reads are prefix-based, so they are credited to the stable prefix
    # (capped at its size) and split between system_prompt and context in
    # proportion to their token totals.
    prefix_total = system_total + context_total
    reads_total = sum(capture.metrics.cache_read_tokens for capture in captures)
    reads_in_prefix = min(reads_total, prefix_total)
    if prefix_total:
        system_reads = reads_in_prefix * (system_total / prefix_total)
    else:
        system_reads = 0.0
    context_reads = reads_in_prefix - system_reads

    input_rate = pricing.rate(provider, model, "input")
    read_rate = pricing.rate(provider, model, "cache_read")

    def build(name: str, layer_type: str, total: float, cached: float) -> LayerReport:
        """Assemble one layer's report; cached tokens never exceed the total."""
        hits = min(cached, total)
        cost_cold = total * input_rate
        cost_actual = hits * read_rate + (total - hits) * input_rate
        if total:
            ratio = hits / total
        else:
            ratio = 0.0
        return LayerReport(
            name=name,
            layer_type=layer_type,
            total_tokens=int(round(total)),
            cached_tokens=int(round(hits)),
            hit_rate=ratio,
            actual_cost_usd=round(cost_actual, 6),
            cold_cost_usd=round(cost_cold, 6),
            savings_usd=round(max(cost_cold - cost_actual, 0.0), 6),
        )

    candidates = (
        ("system_prompt", "static", system_total, system_reads),
        ("context", "semi_static", context_total, context_reads),
        ("conversation", "dynamic", conversation_total, 0.0),
    )
    layers: List[LayerReport] = [
        build(*spec) for spec in candidates if spec[2] > 0
    ]

    return layers, prefix_len, _prefix_tokens_per_call(captures, prefix_len)
184
+
185
+
186
def _prefix_tokens_per_call(captures: List[CallCapture], prefix_len: int) -> float:
    """Estimate the token size of the stable prefix as sent on one call.

    The estimate uses the first capture only: its ``input_tokens`` is split
    by character share and the shares of the first ``prefix_len`` segments
    are summed. Returns 0.0 when there are no captures, no prefix, or no
    captured text.
    """
    if not captures:
        return 0.0
    if prefix_len <= 0:
        return 0.0

    first = captures[0]
    segments = first.segments
    char_total = sum(len(segment.text) for segment in segments)
    if char_total <= 0:
        return 0.0

    input_tokens = first.metrics.input_tokens
    return sum(
        input_tokens * (len(segments[index].text) / char_total)
        for index in range(prefix_len)
    )
198
+
199
+
200
def _build_tips(
    report: SessionReport,
    captures: List[CallCapture],
    layers: List[LayerReport],
    prefix_len: int,
    total_miss: int,
) -> List[str]:
    """Collect the advice strings that apply to this session.

    Each rule is checked independently and, when it fires, appends one tip.
    Tips are returned in the fixed order: Gemini cache missing, under-cached
    context layer, under-cached system prompt, no stable prefix, cold first
    call, high share of uncached input.
    """
    tips: List[str] = []
    by_name: Dict[str, LayerReport] = {}
    for layer in layers:
        by_name[layer.name] = layer
    multi_call = report.total_calls > 1
    have_content = any(capture.segments for capture in captures)

    gemini_without_cache = (
        report.provider == "gemini" and report.total_cached_tokens == 0
    )
    if gemini_without_cache:
        tips.append(
            "No Gemini context cache detected — create a cacheContent object for "
            "stable context (system prompt, schema) to enable cache reads."
        )

    context_layer = by_name.get("context")
    if context_layer and multi_call and context_layer.hit_rate < 0.5:
        tips.append(
            f"context layer (~{context_layer.total_tokens:,} tokens) is identical across "
            f"all {report.total_calls} calls but only {context_layer.hit_rate:.0%} cached — "
            f"move it behind a cache_control breakpoint before the conversation "
            f"history (est. ${report.theoretical_max_savings_usd:.3f} recoverable)."
        )

    system_layer = by_name.get("system_prompt")
    if system_layer and multi_call and system_layer.hit_rate < 0.9:
        tips.append(
            f"system_prompt cache hit rate is {system_layer.hit_rate:.0%} — check the "
            "prefix isn't being prepended with dynamic content that breaks the cache."
        )

    if have_content and multi_call and prefix_len == 0:
        tips.append(
            "No stable prompt prefix detected across calls — content differs every "
            "turn, so prefix caching cannot help. Ensure your system prompt and "
            "static context are byte-identical on each call (and placed first)."
        )

    if captures and captures[0].metrics.cache_read_tokens == 0:
        tips.append(
            "First call always misses the cache (expected). Pre-warm with a dummy "
            "call before the loop starts to eliminate the cold miss."
        )

    if report.total_input_tokens:
        miss_share = total_miss / report.total_input_tokens
        if miss_share > 0.3:
            tips.append(
                f"{miss_share:.0%} of input tokens are "
                "uncached and re-sent each turn — consider summarising tool results "
                "instead of appending them verbatim."
            )

    return tips
@@ -0,0 +1,43 @@
1
+ """cache-lens CLI: `cache-lens run <command>` for zero-code instrumentation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+
8
def main(argv: list = None) -> int:
    """Dispatch the ``cache-lens`` command line and return an exit code.

    Args:
        argv: Command-line arguments without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        0 after printing usage (no arguments, ``-h`` or ``--help``), 2 for an
        unknown command or a ``run`` with no command, and 1 when ``run`` is
        requested but not implemented yet.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args or args[0] in ("-h", "--help"):
        _print_usage()
        return 0

    cmd, rest = args[0], args[1:]
    if cmd == "run":
        try:
            return _run(rest)
        except NotImplementedError as exc:
            # Report the unimplemented feature as a normal CLI error so the
            # console script exits cleanly rather than dumping a traceback.
            sys.stderr.write(f"cache-lens: {exc}\n")
            return 1

    sys.stderr.write(f"cache-lens: unknown command '{cmd}'\n")
    _print_usage()
    return 2


def _run(command: list) -> int:
    """Run ``command`` under instrumentation (not implemented yet).

    Returns:
        2 when ``command`` is empty.

    Raises:
        NotImplementedError: For any non-empty command, until sitecustomize
            injection is implemented.
    """
    if not command:
        sys.stderr.write("cache-lens run: no command given\n")
        return 2
    # v1.0: sitecustomize injection that patches the SDK at import time and
    # registers an atexit report. Implementation tracked in docs/architecture.md.
    raise NotImplementedError(
        "cache-lens run is scaffolded; sitecustomize injection not yet implemented"
    )


def _print_usage() -> None:
    """Write the usage text to stdout."""
    sys.stdout.write(
        "cache-lens — prompt cache instrumentation\n\n"
        "Usage:\n"
        "  cache-lens run <command> [args...]   Instrument a subprocess\n"
    )


if __name__ == "__main__":
    raise SystemExit(main())