headroom-ai 0.2.13 (headroom_ai-0.2.13-py3-none-any.whl)
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/cache/openai.py
ADDED
@@ -0,0 +1,584 @@
"""
OpenAI Cache Optimizer.

This module implements cache optimization for OpenAI's automatic prefix caching.
Unlike Anthropic, OpenAI's caching is fully automatic - users cannot control what
gets cached. The only optimization strategy is to stabilize prefixes to maximize
cache hit rates.

OpenAI Caching Details:
- Fully automatic - no explicit cache control available
- 50% discount on cached input tokens
- Requires prompts > 1024 tokens to activate
- 5-60 minute TTL (varies based on usage patterns)
- Cache is prefix-based - changes invalidate downstream cache

Optimization Strategy:
Since we can't control caching explicitly, we focus on PREFIX_STABILIZATION:
- Extract dynamic content (dates, timestamps) and move to end
- Normalize whitespace for consistent hashing
- Remove random IDs from system prompts
- Track prefix stability to estimate cache hit probability

Dynamic Content Detection Tiers:
- Tier 1 (regex): Always on, ~0ms - dates, UUIDs, timestamps
- Tier 2 (ner): Optional, ~5-10ms - names, money, organizations
- Tier 3 (semantic): Optional, ~20-50ms - volatile patterns via embeddings

Usage:
    # Default: regex only (fastest)
    optimizer = OpenAICacheOptimizer()

    # With NER (requires spacy)
    optimizer = OpenAICacheOptimizer(
        config=CacheConfig(dynamic_detection_tiers=["regex", "ner"])
    )

    # Full detection (requires spacy + sentence-transformers)
    optimizer = OpenAICacheOptimizer(
        config=CacheConfig(dynamic_detection_tiers=["regex", "ner", "semantic"])
    )
"""

from __future__ import annotations

from copy import deepcopy
from dataclasses import dataclass, field
from typing import Any

from .base import (
    BaseCacheOptimizer,
    CacheConfig,
    CacheMetrics,
    CacheResult,
    CacheStrategy,
    OptimizationContext,
)
from .dynamic_detector import (
    DetectorConfig,
    DynamicContentDetector,
    DynamicSpan,
)


@dataclass
class PrefixAnalysis:
    """
    Analysis of prefix stability.

    Used to determine likelihood of cache hits and track changes
    between requests.
    """

    # Hash of the stabilized prefix
    prefix_hash: str

    # Estimated token count of stable prefix
    stable_tokens: int

    # Dynamic content that was extracted
    dynamic_spans: list[DynamicSpan] = field(default_factory=list)

    # Whether prefix changed from previous request
    changed_from_previous: bool = False

    # Previous hash for comparison
    previous_hash: str | None = None

    # Detection processing time
    detection_time_ms: float = 0.0


class OpenAICacheOptimizer(BaseCacheOptimizer):
    """
    Cache optimizer for OpenAI's automatic prefix caching.

    OpenAI automatically caches prompt prefixes for requests > 1024 tokens.
    Since caching is automatic, this optimizer focuses on maximizing cache
    hit rates by stabilizing prefixes.

    Key Optimizations:
        1. Extract dynamic content (dates, times) and move to end of messages
        2. Normalize whitespace for consistent formatting
        3. Remove random IDs and timestamps from system prompts
        4. Track prefix changes to estimate cache hit probability

    Usage:
        optimizer = OpenAICacheOptimizer()
        result = optimizer.optimize(messages, context)

        # Check if prefix was stable (likely cache hit)
        if not result.metrics.prefix_changed_from_previous:
            print("Likely cache hit - prefix unchanged")

        # Estimate savings
        savings = result.metrics.estimated_savings_percent
        print(f"Estimated savings: {savings:.1f}%")

    Attributes:
        name: Identifier for this optimizer
        provider: The provider this optimizer targets ("openai")
        strategy: Always CacheStrategy.PREFIX_STABILIZATION
    """

    # OpenAI-specific constants
    MIN_TOKENS_FOR_CACHING = 1024
    CACHE_DISCOUNT_PERCENT = 50.0

    def __init__(self, config: CacheConfig | None = None):
        """
        Initialize the OpenAI cache optimizer.

        Args:
            config: Optional cache configuration. If not provided,
                sensible defaults are used.

        The optimizer uses the DynamicContentDetector with configurable tiers:
        - "regex": Fast pattern matching (~0ms) - always on
        - "ner": Named Entity Recognition (~5-10ms) - requires spacy
        - "semantic": Embedding similarity (~20-50ms) - requires sentence-transformers

        Configure tiers via config.dynamic_detection_tiers.
        """
        super().__init__(config)

        # Initialize the tiered dynamic content detector
        detector_config = DetectorConfig(
            tiers=self.config.dynamic_detection_tiers,  # type: ignore
        )
        self._detector = DynamicContentDetector(detector_config)

    @property
    def name(self) -> str:
        """Name of this optimizer."""
        return "openai-prefix-stabilizer"

    @property
    def provider(self) -> str:
        """Provider this optimizer is for."""
        return "openai"

    @property
    def strategy(self) -> CacheStrategy:
        """The caching strategy this optimizer uses."""
        return CacheStrategy.PREFIX_STABILIZATION

    def optimize(
        self,
        messages: list[dict[str, Any]],
        context: OptimizationContext,
        config: CacheConfig | None = None,
    ) -> CacheResult:
        """
        Optimize messages for OpenAI's prefix caching.

        This method stabilizes the message prefix to maximize cache hit rates.
        Since OpenAI caching is automatic, we focus on ensuring the prefix
        remains consistent across requests.

        Args:
            messages: List of message dictionaries in OpenAI format.
            context: Optimization context with request metadata.
            config: Optional configuration override.

        Returns:
            CacheResult containing:
            - Optimized messages with stabilized prefixes
            - Metrics about prefix stability and estimated savings
            - List of transforms applied
            - Any warnings encountered

        Example:
            >>> optimizer = OpenAICacheOptimizer()
            >>> messages = [
            ...     {"role": "system", "content": "Today is Jan 1, 2024. You are helpful."},
            ...     {"role": "user", "content": "Hello!"}
            ... ]
            >>> context = OptimizationContext(provider="openai", model="gpt-4")
            >>> result = optimizer.optimize(messages, context)
            >>> # Date moved to end, prefix stabilized
        """
        effective_config = config or self.config

        # Handle disabled optimization
        if not effective_config.enabled:
            return CacheResult(
                messages=messages,
                metrics=CacheMetrics(),
                transforms_applied=[],
            )

        # Deep copy to avoid mutating input
        optimized_messages = deepcopy(messages)
        transforms_applied: list[str] = []
        warnings: list[str] = []

        # Track all extracted spans across messages
        all_spans: list[DynamicSpan] = []
        total_detection_time = 0.0

        # Process system messages for prefix stabilization
        for i, msg in enumerate(optimized_messages):
            if msg.get("role") == "system":
                content = msg.get("content", "")

                if isinstance(content, str):
                    # Use tiered dynamic content detector
                    result = self._detector.detect(content)
                    all_spans.extend(result.spans)
                    total_detection_time += result.processing_time_ms

                    # Add any detector warnings
                    warnings.extend(result.warnings)

                    if result.spans:
                        transforms_applied.append(f"extracted_{len(result.spans)}_dynamic_elements")
                        transforms_applied.extend(f"tier_{tier}" for tier in result.tiers_used)

                    # Get static content with dynamic parts removed
                    stabilized = result.static_content

                    # Normalize whitespace
                    if effective_config.normalize_whitespace:
                        stabilized = self._normalize_whitespace(
                            stabilized,
                            collapse_blank_lines=effective_config.collapse_blank_lines,
                        )
                        transforms_applied.append("normalized_whitespace")

                    # If we extracted dynamic content, append it at the end
                    if result.dynamic_content:
                        dynamic_section = self._format_dynamic_section(
                            result.dynamic_content,
                            separator=effective_config.dynamic_separator,
                        )
                        stabilized = stabilized.rstrip() + dynamic_section

                    optimized_messages[i]["content"] = stabilized

                elif isinstance(content, list):
                    # Handle content blocks (less common for OpenAI)
                    new_content = []
                    for block in content:
                        if isinstance(block, dict) and block.get("type") == "text":
                            text = block.get("text", "")
                            result = self._detector.detect(text)
                            all_spans.extend(result.spans)
                            total_detection_time += result.processing_time_ms
                            warnings.extend(result.warnings)

                            stabilized = result.static_content

                            if effective_config.normalize_whitespace:
                                stabilized = self._normalize_whitespace(stabilized)

                            if result.dynamic_content:
                                dynamic_section = self._format_dynamic_section(
                                    result.dynamic_content,
                                    separator=effective_config.dynamic_separator,
                                )
                                stabilized = stabilized.rstrip() + dynamic_section

                            new_content.append({**block, "text": stabilized})
                        else:
                            new_content.append(block)

                    optimized_messages[i]["content"] = new_content
                    if all_spans:
                        transforms_applied.append("processed_content_blocks")

        # Analyze prefix stability
        analysis = self._analyze_prefix(optimized_messages, context)

        # Calculate token estimates
        tokens_before = self._estimate_total_tokens(messages)
        tokens_after = self._estimate_total_tokens(optimized_messages)

        # Build metrics
        metrics = CacheMetrics(
            stable_prefix_tokens=analysis.stable_tokens,
            stable_prefix_hash=analysis.prefix_hash,
            prefix_changed_from_previous=analysis.changed_from_previous,
            previous_prefix_hash=analysis.previous_hash,
            estimated_cache_hit=not analysis.changed_from_previous,
            cacheable_tokens=self._calculate_cacheable_tokens(analysis.stable_tokens),
            non_cacheable_tokens=max(0, tokens_after - analysis.stable_tokens),
            estimated_savings_percent=self._calculate_savings_percent(
                analysis.stable_tokens,
                tokens_after,
                likely_cache_hit=not analysis.changed_from_previous,
            ),
        )

        # Add warnings for suboptimal cases
        if tokens_after < self.MIN_TOKENS_FOR_CACHING:
            warnings.append(
                f"Prompt has ~{tokens_after} tokens, below OpenAI's {self.MIN_TOKENS_FOR_CACHING} "
                f"token minimum for caching. Consider adding more static context."
            )

        if analysis.changed_from_previous:
            warnings.append(
                "Prefix changed from previous request - cache miss likely. "
                "Consider reviewing what content is changing between requests."
            )

        # Record metrics and update state
        self._record_metrics(metrics)
        self._previous_prefix_hash = analysis.prefix_hash

        return CacheResult(
            messages=optimized_messages,
            metrics=metrics,
            tokens_before=tokens_before,
            tokens_after=tokens_after,
            transforms_applied=list(set(transforms_applied)),  # Dedupe
            warnings=warnings,
        )

    def estimate_savings(
        self,
        messages: list[dict[str, Any]],
        context: OptimizationContext,
    ) -> float:
        """
        Estimate potential cost savings from caching.

        OpenAI provides 50% discount on cached tokens. This method estimates
        what portion of tokens are likely to be cached based on prefix
        stability and token count.

        Args:
            messages: Messages to analyze.
            context: Optimization context.

        Returns:
            Estimated savings as a percentage (0-100).
            Returns 0 if prompt is below caching threshold.

        Example:
            >>> savings = optimizer.estimate_savings(messages, context)
            >>> print(f"Potential savings: {savings:.1f}%")
        """
        total_tokens = self._estimate_total_tokens(messages)

        # No savings if below threshold
        if total_tokens < self.MIN_TOKENS_FOR_CACHING:
            return 0.0

        # Extract system content for prefix analysis
        system_content = self._extract_system_content(messages)
        system_tokens = self._count_tokens_estimate(system_content)

        # Estimate cacheable portion (system + early messages)
        # OpenAI caches the longest matching prefix
        cacheable_ratio = min(1.0, system_tokens / total_tokens)

        # Check if prefix is stable
        current_hash = self._compute_prefix_hash(system_content)
        likely_hit = (
            self._previous_prefix_hash is not None and current_hash == self._previous_prefix_hash
        )

        if likely_hit:
            # 50% savings on cacheable portion
            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT
        else:
            # First request or prefix changed - no immediate savings
            # but return expected savings for future requests
            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT * 0.5

    def _normalize_whitespace(
        self,
        content: str,
        collapse_blank_lines: bool = True,
    ) -> str:
        """
        Normalize whitespace in content.

        Ensures consistent whitespace formatting to improve prefix matching.
        This helps when the same logical content has minor formatting differences.

        Args:
            content: Text to normalize.
            collapse_blank_lines: If True, multiple blank lines become one.

        Returns:
            Content with normalized whitespace.
        """
        # Normalize line endings
        result = content.replace("\r\n", "\n").replace("\r", "\n")

        # Collapse multiple spaces (but preserve indentation)
        lines = result.split("\n")
        normalized_lines = []

        for line in lines:
            # Preserve leading whitespace, normalize trailing
            stripped = line.rstrip()
            if stripped:
                # Find leading whitespace
                leading = len(line) - len(line.lstrip())
                # Collapse multiple spaces in content (not indentation)
                content_part = " ".join(stripped.split())
                normalized_lines.append(
                    " " * leading + content_part[leading:] if leading else content_part
                )
            else:
                normalized_lines.append("")

        result = "\n".join(normalized_lines)

        # Collapse multiple blank lines
        if collapse_blank_lines:
            while "\n\n\n" in result:
                result = result.replace("\n\n\n", "\n\n")

        return result.strip()

    def _format_dynamic_section(
        self,
        dynamic_content: str,
        separator: str = "\n\n---\n\n",
    ) -> str:
        """
        Format extracted dynamic content as a section to append.

        Creates a clearly marked section containing dynamic values,
        appended to the end of the message to preserve prefix stability.

        Args:
            dynamic_content: The dynamic content string to append.
            separator: Separator to use before the dynamic section.

        Returns:
            Formatted dynamic section string.
        """
        if not dynamic_content or not dynamic_content.strip():
            return ""

        # Format as a context section
        return f"{separator}[Current Context]\n{dynamic_content.strip()}\n"

    def _analyze_prefix(
        self,
        messages: list[dict[str, Any]],
        context: OptimizationContext,
    ) -> PrefixAnalysis:
        """
        Analyze the prefix for stability metrics.

        Computes hash of the stable prefix portion and compares with
        previous requests to estimate cache hit likelihood.

        Args:
            messages: Messages to analyze.
            context: Optimization context with previous hash.

        Returns:
            PrefixAnalysis with stability metrics.
        """
        # Extract prefix content (system messages + structure)
        prefix_parts = []

        for msg in messages:
            if msg.get("role") == "system":
                content = msg.get("content", "")
                if isinstance(content, str):
                    prefix_parts.append(content)
                elif isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict) and block.get("type") == "text":
                            prefix_parts.append(block.get("text", ""))

        prefix_content = "\n".join(prefix_parts)
        prefix_hash = self._compute_prefix_hash(prefix_content)
        stable_tokens = self._count_tokens_estimate(prefix_content)

        # Check for changes from previous request
        previous_hash = context.previous_prefix_hash or self._previous_prefix_hash
        changed = previous_hash is not None and prefix_hash != previous_hash

        return PrefixAnalysis(
            prefix_hash=prefix_hash,
            stable_tokens=stable_tokens,
            changed_from_previous=changed,
            previous_hash=previous_hash,
        )

    def _calculate_cacheable_tokens(self, stable_prefix_tokens: int) -> int:
        """
        Calculate how many tokens are likely cacheable.

        OpenAI only caches prompts > 1024 tokens, and caches in chunks.

        Args:
            stable_prefix_tokens: Number of tokens in stable prefix.

        Returns:
            Estimated cacheable token count.
        """
        if stable_prefix_tokens < self.MIN_TOKENS_FOR_CACHING:
            return 0

        # OpenAI caches in 128-token chunks (aligned)
        # Return the aligned cacheable amount
        return (stable_prefix_tokens // 128) * 128

    def _calculate_savings_percent(
        self,
        stable_tokens: int,
        total_tokens: int,
        likely_cache_hit: bool,
    ) -> float:
        """
        Calculate estimated savings percentage.

        Args:
            stable_tokens: Tokens in stable prefix.
            total_tokens: Total tokens in request.
            likely_cache_hit: Whether a cache hit is likely.

        Returns:
            Estimated savings as percentage (0-100).
        """
        if total_tokens == 0:
            return 0.0

        cacheable = self._calculate_cacheable_tokens(stable_tokens)
        if cacheable == 0:
            return 0.0

        cacheable_ratio = cacheable / total_tokens

        if likely_cache_hit:
            # Full 50% savings on cacheable portion
            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT
        else:
            # No savings on first request, but show potential
            return 0.0

    def _estimate_total_tokens(self, messages: list[dict[str, Any]]) -> int:
        """
        Estimate total tokens in messages.

        Args:
            messages: Messages to count.

        Returns:
            Estimated token count.
        """
        total = 0
        for msg in messages:
            content = msg.get("content", "")
            if isinstance(content, str):
                total += self._count_tokens_estimate(content)
            elif isinstance(content, list):
                for block in content:
                    if isinstance(block, dict):
                        if block.get("type") == "text":
                            total += self._count_tokens_estimate(block.get("text", ""))
                        elif block.get("type") == "image_url":
                            # Rough estimate for images
                            total += 85  # Base cost
        return total
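Below is a short, illustrative sketch (not part of the package) of how this optimizer might be exercised, followed by the worked arithmetic behind the chunk-aligned savings estimate. The import paths follow the file listing above, and the OptimizationContext(provider=..., model=...) call mirrors the constructor usage shown in the module's own docstring; the token counts in the comments are assumed figures.

from headroom.cache.base import OptimizationContext
from headroom.cache.openai import OpenAICacheOptimizer

# Default configuration: regex-only dynamic-content detection.
optimizer = OpenAICacheOptimizer()
context = OptimizationContext(provider="openai", model="gpt-4")

messages = [
    {"role": "system", "content": "Today is Jan 1, 2024. You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

result = optimizer.optimize(messages, context)
print(result.transforms_applied)                 # e.g. dynamic date extracted, whitespace normalized
print(result.metrics.estimated_savings_percent)  # 0.0 here: this prompt is far below the 1024-token minimum

# Worked example of _calculate_cacheable_tokens / _calculate_savings_percent,
# with assumed token counts:
#   stable prefix ~2000 tokens -> (2000 // 128) * 128 = 1920 cacheable tokens
#   total prompt  ~3000 tokens -> cacheable_ratio = 1920 / 3000 = 0.64
#   likely cache hit           -> 0.64 * CACHE_DISCOUNT_PERCENT (50.0) = 32% estimated savings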