headroom_ai-0.2.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0

headroom/cache/registry.py
ADDED
@@ -0,0 +1,175 @@
+"""
+Cache Optimizer Registry.
+
+Provides a plugin system for registering and retrieving cache optimizers.
+This allows users to swap implementations and register custom optimizers.
+"""
+
+from __future__ import annotations
+
+from .base import BaseCacheOptimizer, CacheConfig
+
+
+class CacheOptimizerRegistry:
+    """
+    Registry for cache optimizer plugins.
+
+    This registry allows:
+    - Registration of custom optimizers
+    - Retrieval by provider name
+    - Tier-based selection (oss vs enterprise)
+
+    Usage:
+        # Get default optimizer for provider
+        optimizer = CacheOptimizerRegistry.get("anthropic")
+
+        # Get enterprise version if available
+        optimizer = CacheOptimizerRegistry.get("anthropic", tier="enterprise")
+
+        # Register custom optimizer
+        CacheOptimizerRegistry.register("my-provider", MyOptimizer)
+    """
+
+    _optimizers: dict[str, type[BaseCacheOptimizer]] = {}
+    _instances: dict[str, BaseCacheOptimizer] = {}
+
+    @classmethod
+    def register(
+        cls,
+        name: str,
+        optimizer_class: type[BaseCacheOptimizer],
+        *,
+        override: bool = False,
+    ) -> None:
+        """
+        Register a cache optimizer.
+
+        Args:
+            name: Name to register under (e.g., "anthropic", "anthropic-enterprise")
+            optimizer_class: The optimizer class to register
+            override: Whether to override existing registration
+
+        Raises:
+            ValueError: If name already registered and override=False
+        """
+        if name in cls._optimizers and not override:
+            raise ValueError(
+                f"Optimizer '{name}' already registered. Use override=True to replace."
+            )
+        cls._optimizers[name] = optimizer_class
+        # Clear cached instance if exists
+        cls._instances.pop(name, None)
+
+    @classmethod
+    def unregister(cls, name: str) -> None:
+        """
+        Unregister a cache optimizer.
+
+        Args:
+            name: Name to unregister
+        """
+        cls._optimizers.pop(name, None)
+        cls._instances.pop(name, None)
+
+    @classmethod
+    def get(
+        cls,
+        provider: str,
+        tier: str = "oss",
+        config: CacheConfig | None = None,
+        *,
+        cached: bool = True,
+    ) -> BaseCacheOptimizer:
+        """
+        Get a cache optimizer for a provider.
+
+        Args:
+            provider: Provider name (e.g., "anthropic", "openai", "google")
+            tier: Tier to get ("oss" or "enterprise")
+            config: Optional configuration
+            cached: Whether to return cached instance
+
+        Returns:
+            Cache optimizer instance
+
+        Raises:
+            KeyError: If no optimizer registered for provider/tier
+        """
+        # Build the lookup key
+        if tier != "oss":
+            key = f"{provider}-{tier}"
+            # Fall back to OSS if enterprise not available
+            if key not in cls._optimizers:
+                key = provider
+        else:
+            key = provider
+
+        if key not in cls._optimizers:
+            available = list(cls._optimizers.keys())
+            raise KeyError(f"No optimizer registered for '{key}'. Available: {available}")
+
+        # Return cached instance if requested
+        cache_key = f"{key}:{id(config)}" if config else key
+        if cached and cache_key in cls._instances:
+            return cls._instances[cache_key]
+
+        # Create new instance
+        optimizer_class = cls._optimizers[key]
+        instance = optimizer_class(config)
+
+        if cached:
+            cls._instances[cache_key] = instance
+
+        return instance
+
+    @classmethod
+    def list_providers(cls) -> list[str]:
+        """List all registered provider names (excluding tier suffixes)."""
+        providers = set()
+        for name in cls._optimizers:
+            # Remove tier suffix if present
+            base_name = name.split("-")[0]
+            providers.add(base_name)
+        return sorted(providers)
+
+    @classmethod
+    def list_all(cls) -> list[str]:
+        """List all registered optimizer names."""
+        return sorted(cls._optimizers.keys())
+
+    @classmethod
+    def is_registered(cls, name: str) -> bool:
+        """Check if an optimizer is registered."""
+        return name in cls._optimizers
+
+    @classmethod
+    def clear(cls) -> None:
+        """Clear all registrations. Mainly for testing."""
+        cls._optimizers.clear()
+        cls._instances.clear()
+
+    @classmethod
+    def reset_to_defaults(cls) -> None:
+        """Reset to default registrations."""
+        cls.clear()
+        _register_defaults()
+
+
+def _register_defaults() -> None:
+    """Register default optimizers."""
+    # Import here to avoid circular imports
+    from .anthropic import AnthropicCacheOptimizer
+    from .google import GoogleCacheOptimizer
+    from .openai import OpenAICacheOptimizer
+
+    CacheOptimizerRegistry.register("anthropic", AnthropicCacheOptimizer)
+    CacheOptimizerRegistry.register("openai", OpenAICacheOptimizer)
+    CacheOptimizerRegistry.register("google", GoogleCacheOptimizer)
+
+
+# Auto-register defaults on module import
+# Wrapped in try/except to allow partial imports during development
+try:
+    _register_defaults()
+except ImportError:
+    pass
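
The get() lookup builds a "{provider}-{tier}" key and silently falls back to the base registration when no tier-specific optimizer exists. A minimal sketch of the plugin flow, assuming only what this hunk shows (a BaseCacheOptimizer subclass constructed as optimizer_class(config); the abstract interface itself lives in headroom/cache/base.py, outside this hunk):

    from headroom.cache.base import BaseCacheOptimizer
    from headroom.cache.registry import CacheOptimizerRegistry

    class MyOptimizer(BaseCacheOptimizer):
        """Hypothetical optimizer; the real abstract methods are defined in base.py."""

    # Register a base ("oss") optimizer and an enterprise-tier variant.
    CacheOptimizerRegistry.register("myprov", MyOptimizer)
    CacheOptimizerRegistry.register("myprov-enterprise", MyOptimizer)

    # Exact tier match resolves the "myprov-enterprise" registration.
    opt = CacheOptimizerRegistry.get("myprov", tier="enterprise")

    # An unregistered tier falls back to "myprov" instead of raising KeyError.
    opt = CacheOptimizerRegistry.get("myprov", tier="premium")

    # With cached=True (the default) and no config, lookups reuse one instance.
    assert CacheOptimizerRegistry.get("myprov") is CacheOptimizerRegistry.get("myprov")

One quirk worth flagging: list_providers() takes name.split("-")[0], so a hyphenated base name such as the docstring's "my-provider" is reported as "my".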

headroom/cache/semantic.py
ADDED
@@ -0,0 +1,451 @@
+"""
+Semantic Cache Layer.
+
+Provides query-level semantic caching using embedding similarity.
+This is COMPLEMENTARY to provider prompt caching - it caches complete
+responses for semantically similar queries.
+
+How it works:
+1. When a query comes in, compute its embedding
+2. Search for similar queries in the cache (cosine similarity)
+3. If similarity > threshold, return cached response
+4. Otherwise, proceed with normal optimization
+
+Key difference from Prompt Caching:
+- Prompt Caching: Provider caches KV-cache for prefix (same prompt = faster)
+- Semantic Caching: We cache responses for similar queries (similar query = cached answer)
+
+Usage:
+    from headroom.cache import SemanticCacheLayer, CacheOptimizerRegistry
+
+    # Get provider optimizer
+    provider_optimizer = CacheOptimizerRegistry.get("anthropic")
+
+    # Wrap with semantic layer
+    semantic = SemanticCacheLayer(
+        provider_optimizer,
+        similarity_threshold=0.95,
+    )
+
+    result = semantic.process(messages, context)
+    if result.semantic_cache_hit:
+        # Use result.cached_response directly
+        pass
+"""
+
+from __future__ import annotations
+
+import hashlib
+import time
+from collections import OrderedDict
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
+
+from .base import (
+    BaseCacheOptimizer,
+    CacheConfig,
+    CacheMetrics,
+    CacheResult,
+    OptimizationContext,
+)
+
+
+@dataclass
+class CacheEntry:
+    """Entry in the semantic cache."""
+
+    # Query embedding
+    embedding: list[float]
+
+    # Original query text
+    query: str
+
+    # Cached response
+    response: Any
+
+    # Metadata
+    created_at: float
+    last_accessed: float
+    access_count: int = 1
+
+    # Hash of the full messages for exact matching
+    messages_hash: str = ""
+
+
+@dataclass
+class SemanticCacheConfig:
+    """Configuration for semantic caching."""
+
+    # Similarity threshold for cache hit (0.0 - 1.0)
+    similarity_threshold: float = 0.95
+
+    # Maximum entries in cache
+    max_entries: int = 1000
+
+    # TTL in seconds (0 = no expiry)
+    ttl_seconds: int = 300
+
+    # Whether to use exact hash matching as fallback
+    use_exact_matching: bool = True
+
+    # Embedding model (if using embeddings)
+    embedding_model: str = "all-MiniLM-L6-v2"
+
+
+class SemanticCache:
+    """
+    In-memory semantic cache with LRU eviction.
+
+    Stores query embeddings and responses, supporting both
+    semantic similarity search and exact hash matching.
+    """
+
+    def __init__(
+        self,
+        config: SemanticCacheConfig | None = None,
+        embedding_fn: Callable[[str], list[float]] | None = None,
+    ):
+        """
+        Initialize the semantic cache.
+
+        Args:
+            config: Cache configuration
+            embedding_fn: Optional custom embedding function.
+                If not provided, uses simple hash-based matching.
+        """
+        self.config = config or SemanticCacheConfig()
+        self._embedding_fn = embedding_fn
+
+        # LRU cache: key -> CacheEntry
+        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
+
+        # Exact hash index: messages_hash -> key
+        self._hash_index: dict[str, str] = {}
+
+        # Statistics
+        self._hits = 0
+        self._misses = 0
+        self._evictions = 0
+
+    def get(
+        self,
+        query: str,
+        messages_hash: str | None = None,
+    ) -> CacheEntry | None:
+        """
+        Look up a cached entry.
+
+        Args:
+            query: Query text to search for
+            messages_hash: Optional exact hash for fast lookup
+
+        Returns:
+            CacheEntry if found, None otherwise
+        """
+        self._cleanup_expired()
+
+        # Try exact hash match first
+        if messages_hash and self.config.use_exact_matching:
+            key = self._hash_index.get(messages_hash)
+            if key and key in self._cache:
+                entry = self._cache[key]
+                self._touch(key)
+                self._hits += 1
+                return entry
+
+        # Try semantic similarity if we have embedding function
+        if self._embedding_fn:
+            query_embedding = self._embedding_fn(query)
+            best_match, best_similarity = self._find_similar(query_embedding)
+
+            if best_similarity >= self.config.similarity_threshold:
+                self._touch(best_match)
+                self._hits += 1
+                return self._cache[best_match]
+
+        self._misses += 1
+        return None
+
+    def put(
+        self,
+        query: str,
+        response: Any,
+        messages_hash: str | None = None,
+    ) -> str:
+        """
+        Store a response in the cache.
+
+        Args:
+            query: Query text
+            response: Response to cache
+            messages_hash: Optional exact hash for fast lookup
+
+        Returns:
+            Cache key for the entry
+        """
+        self._cleanup_expired()
+
+        # Evict if at capacity
+        while len(self._cache) >= self.config.max_entries:
+            self._evict_oldest()
+
+        # Generate embedding if available
+        embedding: list[float] = []
+        if self._embedding_fn:
+            embedding = self._embedding_fn(query)
+
+        # Create cache key
+        key = self._generate_key(query)
+
+        now = time.time()
+        entry = CacheEntry(
+            embedding=embedding,
+            query=query,
+            response=response,
+            created_at=now,
+            last_accessed=now,
+            messages_hash=messages_hash or "",
+        )
+
+        self._cache[key] = entry
+
+        # Index by hash for fast exact matching
+        if messages_hash:
+            self._hash_index[messages_hash] = key
+
+        return key
+
+    def invalidate(self, key: str) -> bool:
+        """Invalidate a cache entry by key."""
+        if key in self._cache:
+            entry = self._cache.pop(key)
+            if entry.messages_hash:
+                self._hash_index.pop(entry.messages_hash, None)
+            return True
+        return False
+
+    def clear(self) -> None:
+        """Clear all cache entries."""
+        self._cache.clear()
+        self._hash_index.clear()
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get cache statistics."""
+        total = self._hits + self._misses
+        hit_rate = self._hits / total if total > 0 else 0.0
+
+        return {
+            "entries": len(self._cache),
+            "max_entries": self.config.max_entries,
+            "hits": self._hits,
+            "misses": self._misses,
+            "hit_rate": hit_rate,
+            "evictions": self._evictions,
+        }
+
+    def _find_similar(
+        self,
+        query_embedding: list[float],
+    ) -> tuple[str, float]:
+        """Find the most similar cached entry."""
+        best_key = ""
+        best_similarity = -1.0
+
+        for key, entry in self._cache.items():
+            if not entry.embedding:
+                continue
+
+            similarity = self._cosine_similarity(query_embedding, entry.embedding)
+            if similarity > best_similarity:
+                best_similarity = similarity
+                best_key = key
+
+        return best_key, best_similarity
+
+    def _cosine_similarity(
+        self,
+        a: list[float],
+        b: list[float],
+    ) -> float:
+        """Compute cosine similarity between two vectors."""
+        if len(a) != len(b) or not a:
+            return 0.0
+
+        dot_product = sum(x * y for x, y in zip(a, b))
+        norm_a = sum(x * x for x in a) ** 0.5
+        norm_b = sum(x * x for x in b) ** 0.5
+
+        if norm_a == 0 or norm_b == 0:
+            return 0.0
+
+        return float(dot_product / (norm_a * norm_b))
+
+    def _touch(self, key: str) -> None:
+        """Update access time and move to end of LRU."""
+        if key in self._cache:
+            entry = self._cache.pop(key)
+            entry.last_accessed = time.time()
+            entry.access_count += 1
+            self._cache[key] = entry
+
+    def _evict_oldest(self) -> None:
+        """Evict the oldest (least recently used) entry."""
+        if self._cache:
+            key, entry = self._cache.popitem(last=False)
+            if entry.messages_hash:
+                self._hash_index.pop(entry.messages_hash, None)
+            self._evictions += 1
+
+    def _cleanup_expired(self) -> None:
+        """Remove expired entries."""
+        if self.config.ttl_seconds <= 0:
+            return
+
+        now = time.time()
+        expired = [
+            key
+            for key, entry in self._cache.items()
+            if now - entry.created_at > self.config.ttl_seconds
+        ]
+
+        for key in expired:
+            entry = self._cache.pop(key)
+            if entry.messages_hash:
+                self._hash_index.pop(entry.messages_hash, None)
+
+    def _generate_key(self, query: str) -> str:
+        """Generate a cache key for a query."""
+        return hashlib.sha256(query.encode()).hexdigest()[:16]
+
+
+class SemanticCacheLayer:
+    """
+    Layer that adds semantic caching on top of provider optimizers.
+
+    This layer checks for semantically similar queries before
+    delegating to the underlying provider optimizer.
+    """
+
+    def __init__(
+        self,
+        provider_optimizer: BaseCacheOptimizer,
+        similarity_threshold: float = 0.95,
+        max_entries: int = 1000,
+        ttl_seconds: int = 300,
+        embedding_fn: Callable[[str], list[float]] | None = None,
+    ):
+        """
+        Initialize the semantic cache layer.
+
+        Args:
+            provider_optimizer: Underlying provider optimizer
+            similarity_threshold: Similarity threshold for cache hits
+            max_entries: Maximum cache entries
+            ttl_seconds: Cache TTL in seconds
+            embedding_fn: Optional embedding function
+        """
+        self.provider_optimizer = provider_optimizer
+
+        cache_config = SemanticCacheConfig(
+            similarity_threshold=similarity_threshold,
+            max_entries=max_entries,
+            ttl_seconds=ttl_seconds,
+        )
+        self.cache = SemanticCache(cache_config, embedding_fn)
+
+    def process(
+        self,
+        messages: list[dict[str, Any]],
+        context: OptimizationContext,
+        config: CacheConfig | None = None,
+    ) -> CacheResult:
+        """
+        Process messages through semantic cache and provider optimizer.
+
+        Args:
+            messages: Messages to process
+            context: Optimization context
+            config: Optional configuration override
+
+        Returns:
+            CacheResult with semantic_cache_hit=True if cache hit
+        """
+        # Extract query for semantic matching
+        query = context.query or self._extract_query(messages)
+        messages_hash = self._compute_messages_hash(messages)
+
+        # Check semantic cache
+        cached = self.cache.get(query, messages_hash)
+        if cached:
+            return CacheResult(
+                messages=messages,
+                semantic_cache_hit=True,
+                cached_response=cached.response,
+                metrics=CacheMetrics(
+                    estimated_cache_hit=True,
+                    estimated_savings_percent=100.0,
+                ),
+                transforms_applied=["semantic_cache_hit"],
+            )
+
+        # Delegate to provider optimizer
+        result = self.provider_optimizer.optimize(messages, context, config)
+
+        return result
+
+    def store_response(
+        self,
+        messages: list[dict[str, Any]],
+        response: Any,
+        context: OptimizationContext | None = None,
+    ) -> str:
+        """
+        Store a response in the semantic cache.
+
+        Call this after receiving a response from the LLM to enable
+        future cache hits.
+
+        Args:
+            messages: Original messages
+            response: Response from LLM
+            context: Optional context with query
+
+        Returns:
+            Cache key
+        """
+        query = (context.query if context else None) or self._extract_query(messages)
+        messages_hash = self._compute_messages_hash(messages)
+
+        return self.cache.put(query, response, messages_hash)
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get combined statistics."""
+        return {
+            "semantic_cache": self.cache.get_stats(),
+            "provider_optimizer": self.provider_optimizer.name,
+        }
+
+    def _extract_query(self, messages: list[dict[str, Any]]) -> str:
+        """Extract the last user query from messages."""
+        for msg in reversed(messages):
+            if msg.get("role") == "user":
+                content = msg.get("content", "")
+                if isinstance(content, str):
+                    return content
+                elif isinstance(content, list):
+                    for block in content:
+                        if isinstance(block, dict) and block.get("type") == "text":
+                            text_val = block.get("text", "")
+                            return str(text_val) if text_val else ""
+        return ""
+
+    def _compute_messages_hash(self, messages: list[dict[str, Any]]) -> str:
+        """Compute a hash of all messages."""
+        import json
+
+        try:
+            content = json.dumps(messages, sort_keys=True)
+            return hashlib.sha256(content.encode()).hexdigest()[:24]
+        except (TypeError, ValueError):
+            return ""
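
SemanticCache accepts any Callable[[str], list[float]] as embedding_fn; without one, lookups degrade to exact messages_hash matching only. A minimal sketch wiring in sentence-transformers with the all-MiniLM-L6-v2 model named in SemanticCacheConfig (the sentence-transformers dependency is an assumption for illustration, not something this wheel declares):

    from sentence_transformers import SentenceTransformer  # assumed installed, not a wheel dependency

    from headroom.cache.semantic import SemanticCache, SemanticCacheConfig

    model = SentenceTransformer("all-MiniLM-L6-v2")
    cache = SemanticCache(
        SemanticCacheConfig(similarity_threshold=0.9, ttl_seconds=600),
        embedding_fn=lambda text: model.encode(text).tolist(),
    )

    # Stored under a SHA-256 key derived from the exact query text.
    cache.put("What is the capital of France?", response={"text": "Paris"})

    # A paraphrase hashes differently but can still hit via cosine similarity.
    entry = cache.get("What's the capital city of France?")
    if entry is not None:
        print(entry.response, cache.get_stats()["hit_rate"])

Note the linear scan in _find_similar: every lookup compares against all cached embeddings, which is reasonable at the default max_entries=1000 but argues for a vector index at larger sizes.
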
headroom/ccr/__init__.py
ADDED
@@ -0,0 +1,77 @@
+"""CCR (Compress-Cache-Retrieve) module for reversible compression.
+
+This module provides tool injection and retrieval handling for the CCR architecture.
+When tool outputs are compressed, the LLM can retrieve more data if needed.
+
+Three key components:
+1. Tool Injection: Proxy injects headroom_retrieve tool into requests
+2. Response Handler: Intercepts responses, handles CCR tool calls automatically
+3. Context Tracker: Tracks compressed content across turns, enables proactive expansion
+
+Two distribution channels for the retrieval tool:
+1. Tool Injection: Proxy injects tool into request when compression occurs
+2. MCP Server: Standalone server exposes tool via MCP protocol
+
+When MCP is configured, tool injection is skipped to avoid duplicates.
+"""
+
+from .context_tracker import (
+    CompressedContext,
+    ContextTracker,
+    ContextTrackerConfig,
+    ExpansionRecommendation,
+    get_context_tracker,
+    reset_context_tracker,
+)
+from .response_handler import (
+    CCRResponseHandler,
+    CCRToolCall,
+    CCRToolResult,
+    ResponseHandlerConfig,
+    StreamingCCRBuffer,
+    StreamingCCRHandler,
+)
+from .tool_injection import (
+    CCR_TOOL_NAME,
+    CCRToolInjector,
+    create_ccr_tool_definition,
+    create_system_instructions,
+    parse_tool_call,
+)
+
+# MCP server is optional (requires mcp package)
+try:
+    from .mcp_server import CCRMCPServer, create_ccr_mcp_server
+
+    MCP_SERVER_AVAILABLE = True
+except ImportError:
+    CCRMCPServer = None  # type: ignore
+    create_ccr_mcp_server = None  # type: ignore
+    MCP_SERVER_AVAILABLE = False
+
+__all__ = [
+    # Tool injection
+    "CCR_TOOL_NAME",
+    "CCRToolInjector",
+    "create_ccr_tool_definition",
+    "create_system_instructions",
+    "parse_tool_call",
+    # Response handling
+    "CCRResponseHandler",
+    "CCRToolCall",
+    "CCRToolResult",
+    "ResponseHandlerConfig",
+    "StreamingCCRBuffer",
+    "StreamingCCRHandler",
+    # Context tracking
+    "CompressedContext",
+    "ContextTracker",
+    "ContextTrackerConfig",
+    "ExpansionRecommendation",
+    "get_context_tracker",
+    "reset_context_tracker",
+    # MCP server
+    "CCRMCPServer",
+    "create_ccr_mcp_server",
+    "MCP_SERVER_AVAILABLE",
+]
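
Because the MCP server is an optional extra, callers are expected to branch on MCP_SERVER_AVAILABLE rather than import mcp_server directly. A sketch of that guard (constructor and factory arguments live in the other ccr modules, not this hunk, so the calls are left elided):

    from headroom.ccr import CCR_TOOL_NAME, MCP_SERVER_AVAILABLE

    if MCP_SERVER_AVAILABLE:
        # mcp extra installed: expose retrieval over MCP; per the module
        # docstring, proxy-side injection is then skipped so the model never
        # sees duplicate retrieval tools.
        from headroom.ccr import create_ccr_mcp_server
        ...
    else:
        # Otherwise the proxy injects CCR_TOOL_NAME into requests itself.
        from headroom.ccr import CCRToolInjector
        ...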