headroom-ai 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/cache/openai.py
@@ -0,0 +1,584 @@
+"""
+OpenAI Cache Optimizer.
+
+This module implements cache optimization for OpenAI's automatic prefix caching.
+Unlike Anthropic, OpenAI's caching is fully automatic - users cannot control what
+gets cached. The only optimization strategy is to stabilize prefixes to maximize
+cache hit rates.
+
+OpenAI Caching Details:
+- Fully automatic - no explicit cache control available
+- 50% discount on cached input tokens
+- Requires prompts > 1024 tokens to activate
+- 5-60 minute TTL (varies based on usage patterns)
+- Cache is prefix-based - changes invalidate downstream cache
+
+Optimization Strategy:
+Since we can't control caching explicitly, we focus on PREFIX_STABILIZATION:
+- Extract dynamic content (dates, timestamps) and move to end
+- Normalize whitespace for consistent hashing
+- Remove random IDs from system prompts
+- Track prefix stability to estimate cache hit probability
+
+Dynamic Content Detection Tiers:
+- Tier 1 (regex): Always on, ~0ms - dates, UUIDs, timestamps
+- Tier 2 (ner): Optional, ~5-10ms - names, money, organizations
+- Tier 3 (semantic): Optional, ~20-50ms - volatile patterns via embeddings
+
+Usage:
+    # Default: regex only (fastest)
+    optimizer = OpenAICacheOptimizer()
+
+    # With NER (requires spacy)
+    optimizer = OpenAICacheOptimizer(
+        config=CacheConfig(dynamic_detection_tiers=["regex", "ner"])
+    )
+
+    # Full detection (requires spacy + sentence-transformers)
+    optimizer = OpenAICacheOptimizer(
+        config=CacheConfig(dynamic_detection_tiers=["regex", "ner", "semantic"])
+    )
+"""
+
+from __future__ import annotations
+
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import Any
+
+from .base import (
+    BaseCacheOptimizer,
+    CacheConfig,
+    CacheMetrics,
+    CacheResult,
+    CacheStrategy,
+    OptimizationContext,
+)
+from .dynamic_detector import (
+    DetectorConfig,
+    DynamicContentDetector,
+    DynamicSpan,
+)
+
+
+@dataclass
+class PrefixAnalysis:
+    """
+    Analysis of prefix stability.
+
+    Used to determine likelihood of cache hits and track changes
+    between requests.
+    """
+
+    # Hash of the stabilized prefix
+    prefix_hash: str
+
+    # Estimated token count of stable prefix
+    stable_tokens: int
+
+    # Dynamic content that was extracted
+    dynamic_spans: list[DynamicSpan] = field(default_factory=list)
+
+    # Whether prefix changed from previous request
+    changed_from_previous: bool = False
+
+    # Previous hash for comparison
+    previous_hash: str | None = None
+
+    # Detection processing time
+    detection_time_ms: float = 0.0
+
+
+class OpenAICacheOptimizer(BaseCacheOptimizer):
+    """
+    Cache optimizer for OpenAI's automatic prefix caching.
+
+    OpenAI automatically caches prompt prefixes for requests > 1024 tokens.
+    Since caching is automatic, this optimizer focuses on maximizing cache
+    hit rates by stabilizing prefixes.
+
+    Key Optimizations:
+        1. Extract dynamic content (dates, times) and move to end of messages
+        2. Normalize whitespace for consistent formatting
+        3. Remove random IDs and timestamps from system prompts
+        4. Track prefix changes to estimate cache hit probability
+
+    Usage:
+        optimizer = OpenAICacheOptimizer()
+        result = optimizer.optimize(messages, context)
+
+        # Check if prefix was stable (likely cache hit)
+        if not result.metrics.prefix_changed_from_previous:
+            print("Likely cache hit - prefix unchanged")
+
+        # Estimate savings
+        savings = result.metrics.estimated_savings_percent
+        print(f"Estimated savings: {savings:.1f}%")
+
+    Attributes:
+        name: Identifier for this optimizer
+        provider: The provider this optimizer targets ("openai")
+        strategy: Always CacheStrategy.PREFIX_STABILIZATION
+    """
+
+    # OpenAI-specific constants
+    MIN_TOKENS_FOR_CACHING = 1024
+    CACHE_DISCOUNT_PERCENT = 50.0
+
+    def __init__(self, config: CacheConfig | None = None):
+        """
+        Initialize the OpenAI cache optimizer.
+
+        Args:
+            config: Optional cache configuration. If not provided,
+                sensible defaults are used.
+
+        The optimizer uses the DynamicContentDetector with configurable tiers:
+        - "regex": Fast pattern matching (~0ms) - always on
+        - "ner": Named Entity Recognition (~5-10ms) - requires spacy
+        - "semantic": Embedding similarity (~20-50ms) - requires sentence-transformers
+
+        Configure tiers via config.dynamic_detection_tiers.
+        """
+        super().__init__(config)
+
+        # Initialize the tiered dynamic content detector
+        detector_config = DetectorConfig(
+            tiers=self.config.dynamic_detection_tiers,  # type: ignore
+        )
+        self._detector = DynamicContentDetector(detector_config)
+
+    @property
+    def name(self) -> str:
+        """Name of this optimizer."""
+        return "openai-prefix-stabilizer"
+
+    @property
+    def provider(self) -> str:
+        """Provider this optimizer is for."""
+        return "openai"
+
+    @property
+    def strategy(self) -> CacheStrategy:
+        """The caching strategy this optimizer uses."""
+        return CacheStrategy.PREFIX_STABILIZATION
+
+    def optimize(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+        config: CacheConfig | None = None,
+    ) -> CacheResult:
+        """
+        Optimize messages for OpenAI's prefix caching.
+
+        This method stabilizes the message prefix to maximize cache hit rates.
+        Since OpenAI caching is automatic, we focus on ensuring the prefix
+        remains consistent across requests.
+
+        Args:
+            messages: List of message dictionaries in OpenAI format.
+            context: Optimization context with request metadata.
+            config: Optional configuration override.
+
+        Returns:
+            CacheResult containing:
+            - Optimized messages with stabilized prefixes
+            - Metrics about prefix stability and estimated savings
+            - List of transforms applied
+            - Any warnings encountered
+
+        Example:
+            >>> optimizer = OpenAICacheOptimizer()
+            >>> messages = [
+            ...     {"role": "system", "content": "Today is Jan 1, 2024. You are helpful."},
+            ...     {"role": "user", "content": "Hello!"}
+            ... ]
+            >>> context = OptimizationContext(provider="openai", model="gpt-4")
+            >>> result = optimizer.optimize(messages, context)
+            >>> # Date moved to end, prefix stabilized
+        """
+        effective_config = config or self.config
+
+        # Handle disabled optimization
+        if not effective_config.enabled:
+            return CacheResult(
+                messages=messages,
+                metrics=CacheMetrics(),
+                transforms_applied=[],
+            )
+
+        # Deep copy to avoid mutating input
+        optimized_messages = deepcopy(messages)
+        transforms_applied: list[str] = []
+        warnings: list[str] = []
+
+        # Track all extracted spans across messages
+        all_spans: list[DynamicSpan] = []
+        total_detection_time = 0.0
+
+        # Process system messages for prefix stabilization
+        for i, msg in enumerate(optimized_messages):
+            if msg.get("role") == "system":
+                content = msg.get("content", "")
+
+                if isinstance(content, str):
+                    # Use tiered dynamic content detector
+                    result = self._detector.detect(content)
+                    all_spans.extend(result.spans)
+                    total_detection_time += result.processing_time_ms
+
+                    # Add any detector warnings
+                    warnings.extend(result.warnings)
+
+                    if result.spans:
+                        transforms_applied.append(f"extracted_{len(result.spans)}_dynamic_elements")
+                        transforms_applied.extend(f"tier_{tier}" for tier in result.tiers_used)
+
+                    # Get static content with dynamic parts removed
+                    stabilized = result.static_content
+
+                    # Normalize whitespace
+                    if effective_config.normalize_whitespace:
+                        stabilized = self._normalize_whitespace(
+                            stabilized,
+                            collapse_blank_lines=effective_config.collapse_blank_lines,
+                        )
+                        transforms_applied.append("normalized_whitespace")
+
+                    # If we extracted dynamic content, append it at the end
+                    if result.dynamic_content:
+                        dynamic_section = self._format_dynamic_section(
+                            result.dynamic_content,
+                            separator=effective_config.dynamic_separator,
+                        )
+                        stabilized = stabilized.rstrip() + dynamic_section
+
+                    optimized_messages[i]["content"] = stabilized
+
+                elif isinstance(content, list):
+                    # Handle content blocks (less common for OpenAI)
+                    new_content = []
+                    for block in content:
+                        if isinstance(block, dict) and block.get("type") == "text":
+                            text = block.get("text", "")
+                            result = self._detector.detect(text)
+                            all_spans.extend(result.spans)
+                            total_detection_time += result.processing_time_ms
+                            warnings.extend(result.warnings)
+
+                            stabilized = result.static_content
+
+                            if effective_config.normalize_whitespace:
+                                stabilized = self._normalize_whitespace(stabilized)
+
+                            if result.dynamic_content:
+                                dynamic_section = self._format_dynamic_section(
+                                    result.dynamic_content,
+                                    separator=effective_config.dynamic_separator,
+                                )
+                                stabilized = stabilized.rstrip() + dynamic_section
+
+                            new_content.append({**block, "text": stabilized})
+                        else:
+                            new_content.append(block)
+
+                    optimized_messages[i]["content"] = new_content
+                    if all_spans:
+                        transforms_applied.append("processed_content_blocks")
+
+        # Analyze prefix stability
+        analysis = self._analyze_prefix(optimized_messages, context)
+
+        # Calculate token estimates
+        tokens_before = self._estimate_total_tokens(messages)
+        tokens_after = self._estimate_total_tokens(optimized_messages)
+
+        # Build metrics
+        metrics = CacheMetrics(
+            stable_prefix_tokens=analysis.stable_tokens,
+            stable_prefix_hash=analysis.prefix_hash,
+            prefix_changed_from_previous=analysis.changed_from_previous,
+            previous_prefix_hash=analysis.previous_hash,
+            estimated_cache_hit=not analysis.changed_from_previous,
+            cacheable_tokens=self._calculate_cacheable_tokens(analysis.stable_tokens),
+            non_cacheable_tokens=max(0, tokens_after - analysis.stable_tokens),
+            estimated_savings_percent=self._calculate_savings_percent(
+                analysis.stable_tokens,
+                tokens_after,
+                likely_cache_hit=not analysis.changed_from_previous,
+            ),
+        )
+
+        # Add warnings for suboptimal cases
+        if tokens_after < self.MIN_TOKENS_FOR_CACHING:
+            warnings.append(
+                f"Prompt has ~{tokens_after} tokens, below OpenAI's {self.MIN_TOKENS_FOR_CACHING} "
+                "token minimum for caching. Consider adding more static context."
+            )
+
+        if analysis.changed_from_previous:
+            warnings.append(
+                "Prefix changed from previous request - cache miss likely. "
+                "Consider reviewing what content is changing between requests."
+            )
+
+        # Record metrics and update state
+        self._record_metrics(metrics)
+        self._previous_prefix_hash = analysis.prefix_hash
+
+        return CacheResult(
+            messages=optimized_messages,
+            metrics=metrics,
+            tokens_before=tokens_before,
+            tokens_after=tokens_after,
+            transforms_applied=list(set(transforms_applied)),  # Dedupe
+            warnings=warnings,
+        )
+
+    def estimate_savings(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+    ) -> float:
+        """
+        Estimate potential cost savings from caching.
+
+        OpenAI provides a 50% discount on cached tokens. This method estimates
+        what portion of tokens is likely to be cached, based on prefix
+        stability and token count.
+
+        Args:
+            messages: Messages to analyze.
+            context: Optimization context.
+
+        Returns:
+            Estimated savings as a percentage (0-100).
+            Returns 0 if the prompt is below the caching threshold.
+
+        Example:
+            >>> savings = optimizer.estimate_savings(messages, context)
+            >>> print(f"Potential savings: {savings:.1f}%")
+        """
+        total_tokens = self._estimate_total_tokens(messages)
+
+        # No savings if below threshold
+        if total_tokens < self.MIN_TOKENS_FOR_CACHING:
+            return 0.0
+
+        # Extract system content for prefix analysis
+        system_content = self._extract_system_content(messages)
+        system_tokens = self._count_tokens_estimate(system_content)
+
+        # Estimate cacheable portion (system + early messages)
+        # OpenAI caches the longest matching prefix
+        cacheable_ratio = min(1.0, system_tokens / total_tokens)
+
+        # Check if prefix is stable
+        current_hash = self._compute_prefix_hash(system_content)
+        likely_hit = (
+            self._previous_prefix_hash is not None and current_hash == self._previous_prefix_hash
+        )
+
+        if likely_hit:
+            # 50% savings on cacheable portion
+            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT
+        else:
+            # First request or prefix changed - no immediate savings,
+            # but return expected savings for future requests
+            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT * 0.5
+
+    def _normalize_whitespace(
+        self,
+        content: str,
+        collapse_blank_lines: bool = True,
+    ) -> str:
+        """
+        Normalize whitespace in content.
+
+        Ensures consistent whitespace formatting to improve prefix matching.
+        This helps when the same logical content has minor formatting differences.
+
+        Args:
+            content: Text to normalize.
+            collapse_blank_lines: If True, multiple blank lines become one.
+
+        Returns:
+            Content with normalized whitespace.
+        """
+        # Normalize line endings
+        result = content.replace("\r\n", "\n").replace("\r", "\n")
+
+        # Collapse multiple spaces (but preserve indentation)
+        lines = result.split("\n")
+        normalized_lines = []
+
+        for line in lines:
+            # Preserve leading whitespace, normalize trailing
+            stripped = line.rstrip()
+            if stripped:
+                # Find leading whitespace
+                leading = len(line) - len(line.lstrip())
+                # Collapse multiple spaces in content (not indentation);
+                # str.split() already drops the leading whitespace, so
+                # re-attach the original indentation unchanged
+                content_part = " ".join(stripped.split())
+                normalized_lines.append(" " * leading + content_part)
+            else:
+                normalized_lines.append("")
+
+        result = "\n".join(normalized_lines)
+
+        # Collapse multiple blank lines
+        if collapse_blank_lines:
+            while "\n\n\n" in result:
+                result = result.replace("\n\n\n", "\n\n")
+
+        return result.strip()
+
+    def _format_dynamic_section(
+        self,
+        dynamic_content: str,
+        separator: str = "\n\n---\n\n",
+    ) -> str:
+        """
+        Format extracted dynamic content as a section to append.
+
+        Creates a clearly marked section containing dynamic values,
+        appended to the end of the message to preserve prefix stability.
+
+        Args:
+            dynamic_content: The dynamic content string to append.
+            separator: Separator to use before the dynamic section.
+
+        Returns:
+            Formatted dynamic section string.
+        """
+        if not dynamic_content or not dynamic_content.strip():
+            return ""
+
+        # Format as a context section
+        return f"{separator}[Current Context]\n{dynamic_content.strip()}\n"
+
+    def _analyze_prefix(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+    ) -> PrefixAnalysis:
+        """
+        Analyze the prefix for stability metrics.
+
+        Computes hash of the stable prefix portion and compares with
+        previous requests to estimate cache hit likelihood.
+
+        Args:
+            messages: Messages to analyze.
+            context: Optimization context with previous hash.
+
+        Returns:
+            PrefixAnalysis with stability metrics.
+        """
+        # Extract prefix content (system messages + structure)
+        prefix_parts = []
+
+        for msg in messages:
+            if msg.get("role") == "system":
+                content = msg.get("content", "")
+                if isinstance(content, str):
+                    prefix_parts.append(content)
+                elif isinstance(content, list):
+                    for block in content:
+                        if isinstance(block, dict) and block.get("type") == "text":
+                            prefix_parts.append(block.get("text", ""))
+
+        prefix_content = "\n".join(prefix_parts)
+        prefix_hash = self._compute_prefix_hash(prefix_content)
+        stable_tokens = self._count_tokens_estimate(prefix_content)
+
+        # Check for changes from previous request
+        previous_hash = context.previous_prefix_hash or self._previous_prefix_hash
+        changed = previous_hash is not None and prefix_hash != previous_hash
+
+        return PrefixAnalysis(
+            prefix_hash=prefix_hash,
+            stable_tokens=stable_tokens,
+            changed_from_previous=changed,
+            previous_hash=previous_hash,
+        )
+
+    def _calculate_cacheable_tokens(self, stable_prefix_tokens: int) -> int:
+        """
+        Calculate how many tokens are likely cacheable.
+
+        OpenAI only caches prompts > 1024 tokens, and caches in chunks.
+
+        Args:
+            stable_prefix_tokens: Number of tokens in stable prefix.
+
+        Returns:
+            Estimated cacheable token count.
+        """
+        if stable_prefix_tokens < self.MIN_TOKENS_FOR_CACHING:
+            return 0
+
+        # OpenAI caches in 128-token chunks (aligned)
+        # Return the aligned cacheable amount
+        return (stable_prefix_tokens // 128) * 128
+
+    def _calculate_savings_percent(
+        self,
+        stable_tokens: int,
+        total_tokens: int,
+        likely_cache_hit: bool,
+    ) -> float:
+        """
+        Calculate estimated savings percentage.
+
+        Args:
+            stable_tokens: Tokens in stable prefix.
+            total_tokens: Total tokens in request.
+            likely_cache_hit: Whether a cache hit is likely.
+
+        Returns:
+            Estimated savings as percentage (0-100).
+        """
+        if total_tokens == 0:
+            return 0.0
+
+        cacheable = self._calculate_cacheable_tokens(stable_tokens)
+        if cacheable == 0:
+            return 0.0
+
+        cacheable_ratio = cacheable / total_tokens
+
+        if likely_cache_hit:
+            # Full 50% savings on cacheable portion
+            return cacheable_ratio * self.CACHE_DISCOUNT_PERCENT
+        else:
+            # No savings on first request, but show potential
+            return 0.0
+
+    def _estimate_total_tokens(self, messages: list[dict[str, Any]]) -> int:
+        """
+        Estimate total tokens in messages.
+
+        Args:
+            messages: Messages to count.
+
+        Returns:
+            Estimated token count.
+        """
+        total = 0
+        for msg in messages:
+            content = msg.get("content", "")
+            if isinstance(content, str):
+                total += self._count_tokens_estimate(content)
+            elif isinstance(content, list):
+                for block in content:
+                    if isinstance(block, dict):
+                        if block.get("type") == "text":
+                            total += self._count_tokens_estimate(block.get("text", ""))
+                        elif block.get("type") == "image_url":
+                            # Rough estimate for images
+                            total += 85  # Base cost
+        return total