headroom-ai 0.2.13__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
@@ -0,0 +1,517 @@
+"""
+Anthropic Cache Optimizer.
+
+Implements cache optimization for Anthropic's explicit cache_control mechanism.
+Anthropic uses ephemeral cache breakpoints to mark content that should be cached.
+
+Anthropic Caching Characteristics:
+- Explicit cache_control: {"type": "ephemeral"} blocks
+- Minimum 1024 tokens for caching to be effective
+- Maximum 4 cache breakpoints per request
+- 5-minute TTL (extended on cache hit)
+- Cost: 25% MORE to write to cache, 90% LESS to read
+
+Usage:
+    from headroom.cache import AnthropicCacheOptimizer, OptimizationContext
+
+    optimizer = AnthropicCacheOptimizer()
+    context = OptimizationContext(provider="anthropic", model="claude-3-opus")
+
+    result = optimizer.optimize(messages, context)
+    # result.messages now contains cache_control blocks
+"""
+
+from __future__ import annotations
+
+import copy
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from .base import (
+    BaseCacheOptimizer,
+    BreakpointLocation,
+    CacheBreakpoint,
+    CacheConfig,
+    CacheMetrics,
+    CacheResult,
+    CacheStrategy,
+    OptimizationContext,
+)
+
+# Anthropic-specific constants
+ANTHROPIC_MIN_CACHEABLE_TOKENS = 1024
+ANTHROPIC_MAX_BREAKPOINTS = 4
+ANTHROPIC_CACHE_TTL_SECONDS = 300  # 5 minutes
+ANTHROPIC_WRITE_COST_MULTIPLIER = 1.25  # 25% more to write
+ANTHROPIC_READ_COST_MULTIPLIER = 0.10  # 90% less to read
+
+
+@dataclass
+class ContentSection:
+    """Represents a section of content that may be cacheable."""
+
+    content: str | list[dict[str, Any]]
+    section_type: str  # "system", "tools", "examples", "user", "assistant"
+    message_index: int
+    content_index: int | None = None
+    token_count: int = 0
+    is_cacheable: bool = False
+    reason: str = ""
+
+
+@dataclass
+class BreakpointPlan:
+    """Plan for where to insert cache breakpoints."""
+
+    breakpoints: list[CacheBreakpoint] = field(default_factory=list)
+    total_cacheable_tokens: int = 0
+    estimated_savings_percent: float = 0.0
+    warnings: list[str] = field(default_factory=list)
+
+
+class AnthropicCacheOptimizer(BaseCacheOptimizer):
+    """
+    Cache optimizer for Anthropic's explicit cache_control mechanism.
+
+    This optimizer analyzes messages and inserts cache_control blocks at
+    optimal positions to maximize cache hit rates and minimize costs.
+
+    Key features:
+    - Detects cacheable sections (system prompt, tools, few-shot examples)
+    - Respects Anthropic's 1024 token minimum and 4 breakpoint maximum
+    - Stabilizes prefixes by moving dates and normalizing whitespace
+    - Tracks metrics for monitoring and debugging
+    """
+
+    def __init__(self, config: CacheConfig | None = None):
+        super().__init__(config)
+        if self.config.min_cacheable_tokens < ANTHROPIC_MIN_CACHEABLE_TOKENS:
+            self.config.min_cacheable_tokens = ANTHROPIC_MIN_CACHEABLE_TOKENS
+        if self.config.max_breakpoints > ANTHROPIC_MAX_BREAKPOINTS:
+            self.config.max_breakpoints = ANTHROPIC_MAX_BREAKPOINTS
+
+    @property
+    def name(self) -> str:
+        return "anthropic-cache-optimizer"
+
+    @property
+    def provider(self) -> str:
+        return "anthropic"
+
+    @property
+    def strategy(self) -> CacheStrategy:
+        return CacheStrategy.EXPLICIT_BREAKPOINTS
+
+    def optimize(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+        config: CacheConfig | None = None,
+    ) -> CacheResult:
+        """
+        Optimize messages for Anthropic's cache.
+
+        Steps:
+        1. Analyze messages to identify cacheable sections
+        2. Stabilize the prefix (moves dates, normalizes whitespace)
+        3. Plan breakpoint placement
+        4. Insert cache_control blocks at optimal positions
+        5. Record metrics for monitoring
+        """
+        effective_config = config or self.config
+
+        if not effective_config.enabled:
+            return CacheResult(
+                messages=messages,
+                metrics=CacheMetrics(),
+                transforms_applied=[],
+            )
+
+        optimized_messages = copy.deepcopy(messages)
+        transforms_applied: list[str] = []
+        warnings: list[str] = []
+
+        # Step 1: Analyze content sections
+        sections = self._analyze_sections(optimized_messages)
+
+        # Step 2: Stabilize prefix
+        optimized_messages, stabilization_applied = self._stabilize_prefix(
+            optimized_messages, effective_config
+        )
+        transforms_applied.extend(stabilization_applied)
+
+        # Step 3: Plan breakpoint placement
+        plan = self._plan_breakpoints(sections, effective_config)
+        warnings.extend(plan.warnings)
+
+        # Step 4: Insert cache_control blocks
+        optimized_messages = self._insert_breakpoints(optimized_messages, plan.breakpoints)
+        if plan.breakpoints:
+            transforms_applied.append(f"inserted_{len(plan.breakpoints)}_cache_breakpoints")
+
+        # Step 5: Compute metrics
+        prefix_content = self._extract_cacheable_content(optimized_messages)
+        prefix_hash = self._compute_prefix_hash(prefix_content)
+
+        cache_hit = False
+        if context.previous_prefix_hash:
+            cache_hit = prefix_hash == context.previous_prefix_hash
+        elif self._previous_prefix_hash:
+            cache_hit = prefix_hash == self._previous_prefix_hash
+
+        total_tokens = sum(s.token_count for s in sections)
+        cacheable_tokens = plan.total_cacheable_tokens
+
+        metrics = CacheMetrics(
+            stable_prefix_tokens=cacheable_tokens,
+            stable_prefix_hash=prefix_hash,
+            breakpoints_inserted=len(plan.breakpoints),
+            breakpoint_locations=plan.breakpoints,
+            prefix_changed_from_previous=not cache_hit,
+            previous_prefix_hash=self._previous_prefix_hash,
+            estimated_cache_hit=cache_hit,
+            estimated_savings_percent=plan.estimated_savings_percent if cache_hit else 0.0,
+            cacheable_tokens=cacheable_tokens,
+            non_cacheable_tokens=total_tokens - cacheable_tokens,
+            cache_ttl_remaining_seconds=ANTHROPIC_CACHE_TTL_SECONDS if cache_hit else None,
+        )
+
+        self._previous_prefix_hash = prefix_hash
+        self._record_metrics(metrics)
+
+        return CacheResult(
+            messages=optimized_messages,
+            metrics=metrics,
+            tokens_before=total_tokens,
+            tokens_after=total_tokens,
+            transforms_applied=transforms_applied,
+            warnings=warnings,
+        )
+
+    def _analyze_sections(self, messages: list[dict[str, Any]]) -> list[ContentSection]:
+        """Analyze messages to identify distinct content sections."""
+        sections: list[ContentSection] = []
+
+        for idx, message in enumerate(messages):
+            role = message.get("role", "")
+            content = message.get("content", "")
+
+            if role == "system":
+                section_type = "system"
+            elif role == "user":
+                section_type = (
+                    "examples" if self._looks_like_example(message, messages, idx) else "user"
+                )
+            elif role == "assistant":
+                section_type = (
+                    "examples" if self._looks_like_example(message, messages, idx) else "assistant"
+                )
+            else:
+                section_type = role
+
+            # Handle tools
+            if "tools" in message:
+                tool_section = ContentSection(
+                    content=message["tools"],
+                    section_type="tools",
+                    message_index=idx,
+                    token_count=self._estimate_tools_tokens(message["tools"]),
+                    is_cacheable=True,
+                    reason="Tool definitions are static and cacheable",
+                )
+                sections.append(tool_section)
+
+            if isinstance(content, str):
+                token_count = self._count_tokens_estimate(content)
+                is_cacheable, reason = self._assess_cacheability(section_type, token_count, content)
+                sections.append(
+                    ContentSection(
+                        content=content,
+                        section_type=section_type,
+                        message_index=idx,
+                        token_count=token_count,
+                        is_cacheable=is_cacheable,
+                        reason=reason,
+                    )
+                )
+
+            elif isinstance(content, list):
+                for block_idx, block in enumerate(content):
+                    if isinstance(block, dict) and block.get("type") == "text":
+                        text = block.get("text", "")
+                        token_count = self._count_tokens_estimate(text)
+                        is_cacheable, reason = self._assess_cacheability(
+                            section_type, token_count, text
+                        )
+                        sections.append(
+                            ContentSection(
+                                content=block,  # type: ignore[arg-type]
+                                section_type=section_type,
+                                message_index=idx,
+                                content_index=block_idx,
+                                token_count=token_count,
+                                is_cacheable=is_cacheable,
+                                reason=reason,
+                            )
+                        )
+
+        return sections
+
+    def _assess_cacheability(
+        self, section_type: str, token_count: int, content: str
+    ) -> tuple[bool, str]:
+        """Assess whether a section is cacheable."""
+        if token_count < self.config.min_cacheable_tokens:
+            return (
+                False,
+                f"Below minimum tokens ({token_count} < {self.config.min_cacheable_tokens})",
+            )
+
+        if section_type == "system":
+            return True, "System prompts are highly cacheable"
+        if section_type == "tools":
+            return True, "Tool definitions are static and cacheable"
+        if section_type == "examples":
+            return True, "Few-shot examples are typically static"
+        if self._has_dynamic_content(content):
+            return False, "Contains dynamic content (dates, times, etc.)"
+        if section_type == "user":
+            return False, "User messages are typically dynamic"
+
+        return True, "Content is large enough for caching"
+
+    def _has_dynamic_content(self, content: str) -> bool:
+        """Check if content has dynamic elements."""
+        for pattern in self.config.date_patterns:
+            if re.search(pattern, content):
+                return True
+        return False
+
+    def _looks_like_example(
+        self,
+        message: dict[str, Any],
+        messages: list[dict[str, Any]],
+        idx: int,
+    ) -> bool:
+        """Determine if a message looks like a few-shot example."""
+        system_idx = -1
+        for i, msg in enumerate(messages):
+            if msg.get("role") == "system":
+                system_idx = i
+                break
+
+        if system_idx >= 0 and idx <= system_idx + 4:
+            role = message.get("role")
+            if role == "user" and idx + 1 < len(messages):
+                if messages[idx + 1].get("role") == "assistant":
+                    return True
+            elif role == "assistant" and idx > 0:
+                if messages[idx - 1].get("role") == "user":
+                    return True
+
+        content = message.get("content", "")
+        if isinstance(content, str):
+            example_markers = ["example:", "for example", "e.g.", "sample:"]
+            return any(marker in content.lower() for marker in example_markers)
+
+        return False
+
+    def _estimate_tools_tokens(self, tools: Any) -> int:
+        """Estimate token count for tool definitions."""
+        import json
+
+        try:
+            return self._count_tokens_estimate(json.dumps(tools))
+        except (TypeError, ValueError):
+            return 0
+
+    def _stabilize_prefix(
+        self,
+        messages: list[dict[str, Any]],
+        config: CacheConfig,
+    ) -> tuple[list[dict[str, Any]], list[str]]:
+        """Stabilize the prefix by moving dynamic content."""
+        transforms: list[str] = []
+
+        for message in messages:
+            if message.get("role") != "system":
+                continue
+
+            content = message.get("content", "")
+            if isinstance(content, str):
+                new_content, applied = self._stabilize_text(content, config)
+                if new_content != content:
+                    message["content"] = new_content
+                    transforms.extend(applied)
+
+            elif isinstance(content, list):
+                for block in content:
+                    if isinstance(block, dict) and block.get("type") == "text":
+                        text = block.get("text", "")
+                        new_text, applied = self._stabilize_text(text, config)
+                        if new_text != text:
+                            block["text"] = new_text
+                            transforms.extend(applied)
+
+        return messages, transforms
+
+    def _stabilize_text(self, text: str, config: CacheConfig) -> tuple[str, list[str]]:
+        """Stabilize a text string."""
+        transforms: list[str] = []
+        result = text
+
+        extracted_dates: list[str] = []
+        for pattern in config.date_patterns:
+            matches = re.findall(pattern, result)
+            if matches:
+                extracted_dates.extend(matches)
+                result = re.sub(pattern, "", result)
+                transforms.append("extracted_dates")
+
+        if config.normalize_whitespace:
+            new_result = re.sub(r"[ \t]+", " ", result)
+            if new_result != result:
+                result = new_result
+                transforms.append("normalized_spaces")
+
+        if config.collapse_blank_lines:
+            new_result = re.sub(r"\n{3,}", "\n\n", result)
+            if new_result != result:
+                result = new_result
+                transforms.append("collapsed_blank_lines")
+
+        result = result.strip()
+
+        if extracted_dates:
+            result = result + config.dynamic_separator + " ".join(extracted_dates)
+
+        return result, list(set(transforms))
+
+    def _plan_breakpoints(
+        self,
+        sections: list[ContentSection],
+        config: CacheConfig,
+    ) -> BreakpointPlan:
+        """Plan where to place cache breakpoints."""
+        plan = BreakpointPlan()
+
+        cacheable = [s for s in sections if s.is_cacheable]
+        if not cacheable:
+            plan.warnings.append("No sections meet caching requirements")
+            return plan
+
+        priority_order = {"system": 0, "tools": 1, "examples": 2}
+        cacheable.sort(key=lambda s: priority_order.get(s.section_type, 3))
+
+        selected: list[ContentSection] = []
+        accumulated_tokens = 0
+
+        for section in cacheable:
+            if len(selected) >= config.max_breakpoints:
+                plan.warnings.append(f"Reached maximum breakpoints ({config.max_breakpoints})")
+                break
+
+            selected.append(section)
+            accumulated_tokens += section.token_count
+
+        for section in selected:
+            location = self._section_type_to_location(section.section_type)
+            breakpoint = CacheBreakpoint(
+                message_index=section.message_index,
+                location=location,
+                content_index=section.content_index,
+                tokens_at_breakpoint=section.token_count,
+                reason=section.reason,
+            )
+            plan.breakpoints.append(breakpoint)
+
+        plan.total_cacheable_tokens = accumulated_tokens
+        if accumulated_tokens > 0:
+            plan.estimated_savings_percent = 90.0
+
+        return plan
+
+    def _section_type_to_location(self, section_type: str) -> BreakpointLocation:
+        """Convert section type to breakpoint location enum."""
+        mapping = {
+            "system": BreakpointLocation.AFTER_SYSTEM,
+            "tools": BreakpointLocation.AFTER_TOOLS,
+            "examples": BreakpointLocation.AFTER_EXAMPLES,
+        }
+        return mapping.get(section_type, BreakpointLocation.CUSTOM)
+
+    def _insert_breakpoints(
+        self,
+        messages: list[dict[str, Any]],
+        breakpoints: list[CacheBreakpoint],
+    ) -> list[dict[str, Any]]:
+        """Insert cache_control blocks at specified positions."""
+        for bp in breakpoints:
+            if bp.message_index >= len(messages):
+                continue
+
+            message = messages[bp.message_index]
+            content = message.get("content", "")
+
+            if isinstance(content, str):
+                message["content"] = [
+                    {
+                        "type": "text",
+                        "text": content,
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ]
+            elif isinstance(content, list):
+                if bp.content_index is not None and bp.content_index < len(content):
+                    block = content[bp.content_index]
+                    if isinstance(block, dict):
+                        block["cache_control"] = {"type": "ephemeral"}
+                elif content:
+                    last_block = content[-1]
+                    if isinstance(last_block, dict):
+                        last_block["cache_control"] = {"type": "ephemeral"}
+
+        return messages
+
+    def _extract_cacheable_content(self, messages: list[dict[str, Any]]) -> str:
+        """Extract content that has cache_control markers for hashing."""
+        parts: list[str] = []
+
+        for message in messages:
+            content = message.get("content", "")
+            if isinstance(content, list):
+                for block in content:
+                    if isinstance(block, dict) and "cache_control" in block:
+                        text = block.get("text", "")
+                        if text:
+                            parts.append(text)
+            elif isinstance(content, str) and message.get("role") == "system":
+                parts.append(content)
+
+        return "\n".join(parts)
+
+    def estimate_savings(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+    ) -> float:
+        """Estimate potential savings from caching."""
+        sections = self._analyze_sections(messages)
+        plan = self._plan_breakpoints(sections, self.config)
+
+        if plan.total_cacheable_tokens == 0:
+            return 0.0
+
+        total_tokens = sum(s.token_count for s in sections)
+        cacheable_ratio = plan.total_cacheable_tokens / total_tokens
+        return 90.0 * cacheable_ratio
+
+    def get_cache_write_cost_multiplier(self) -> float:
+        return ANTHROPIC_WRITE_COST_MULTIPLIER
+
+    def get_cache_read_cost_multiplier(self) -> float:
+        return ANTHROPIC_READ_COST_MULTIPLIER
+
+    def get_cache_ttl_seconds(self) -> int:
+        return ANTHROPIC_CACHE_TTL_SECONDS