headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/config.py
ADDED
@@ -0,0 +1,474 @@
"""Configuration models for Headroom SDK."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Literal


class HeadroomMode(str, Enum):
    """Operating modes for Headroom."""

    AUDIT = "audit"  # Observe only, no modifications
    OPTIMIZE = "optimize"  # Apply deterministic transforms
    SIMULATE = "simulate"  # Return transform plan without API call


# Model context limits should be provided by the Provider.
# This dict allows user overrides only.
DEFAULT_MODEL_CONTEXT_LIMITS: dict[str, int] = {}


@dataclass
class ToolCrusherConfig:
    """Configuration for tool output compression (naive/fixed-rule approach).

    GOTCHAS:
    - Keeps FIRST N items only - may miss important data later in arrays
    - A spike at index 50 will be lost if max_array_items=10
    - String truncation cuts at fixed length, may break mid-word/mid-sentence
    - No awareness of data patterns or importance

    Consider using SmartCrusherConfig instead for statistical analysis.
    """

    enabled: bool = False  # Disabled by default, SmartCrusher is preferred
    min_tokens_to_crush: int = 500  # Only crush if > N tokens
    max_array_items: int = 10  # Keep first N items
    max_string_length: int = 1000  # Truncate strings > N chars
    max_depth: int = 5  # Preserve structure to depth N
    preserve_keys: set[str] = field(
        default_factory=lambda: {"error", "status", "code", "id", "message", "name", "type"}
    )
    tool_profiles: dict[str, dict[str, Any]] = field(default_factory=dict)


@dataclass
class CacheAlignerConfig:
    """Configuration for cache alignment.

    GOTCHAS:
    - Date regex may match non-date content (e.g., version numbers like "2024-01-15")
    - Moving dates to end of system prompt may confuse models if date was
      semantically important in its original position
    - Whitespace normalization may break:
      - Code blocks with significant indentation
      - ASCII art or formatted tables
      - Markdown that relies on specific spacing
    - ISO timestamps in tool outputs may be incorrectly flagged as "dynamic dates"

    SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
    """

    enabled: bool = True
    date_patterns: list[str] = field(
        default_factory=lambda: [
            r"Current [Dd]ate:?\s*\d{4}-\d{2}-\d{2}",
            r"Today is \w+,?\s+\w+ \d+",
            r"Today's date:?\s*\d{4}-\d{2}-\d{2}",
            r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
        ]
    )
    normalize_whitespace: bool = True
    collapse_blank_lines: bool = True
    # Separator used to mark where dynamic content begins in system message.
    # Content before this separator is cached; content after is dynamic.
    dynamic_tail_separator: str = "\n\n---\n[Dynamic Context]\n"


@dataclass
class RollingWindowConfig:
    """Configuration for rolling window token cap.

    GOTCHAS:
    - Dropping old turns loses context the model may need:
      - "As I mentioned earlier..." - what was mentioned is now gone
      - "The user asked about X" - that turn may be dropped
      - Implicit references to prior conversation become orphaned
    - Tool call/result pairs are kept atomic (correct), BUT:
      - Assistant text referencing a dropped tool result becomes confusing
      - "Based on the search results..." when those results are gone
    - keep_last_turns=2 may not be enough for complex multi-step reasoning
    - No semantic analysis - drops oldest first regardless of importance

    SAFER ALTERNATIVES:
    - Increase keep_last_turns for agentic workloads
    - Use summarization for old context (not implemented - would add latency)
    - Set enabled=False for short conversations
    """

    enabled: bool = True
    keep_system: bool = True  # Never drop system prompt
    keep_last_turns: int = 2  # Never drop last N turns
    output_buffer_tokens: int = 4000  # Reserve for output


@dataclass
class RelevanceScorerConfig:
    """Configuration for relevance scoring in SmartCrusher.

    Relevance scoring determines which items to keep when compressing
    tool outputs. Uses the pattern: relevance(item, context) -> [0, 1].

    Available tiers:
    - "bm25": BM25 keyword matching (zero dependencies, fast)
    - "embedding": Semantic similarity via sentence-transformers
    - "hybrid": BM25 + embedding with adaptive fusion (RECOMMENDED)

    DEFAULT: "hybrid" - combines exact matching (UUIDs, IDs) with semantic
    understanding. Falls back to BM25 if sentence-transformers not installed.

    For full hybrid support, install: pip install headroom[relevance]

    WHY HYBRID IS DEFAULT:
    - Missing important items during compression is catastrophic
    - BM25 alone gives low scores for single-term matches (e.g., "Alice" = 0.07)
    - Semantic matching catches "errors" -> "failed", "issues", etc.
    - 5-10ms latency is acceptable vs. losing critical data
    """

    tier: Literal["bm25", "embedding", "hybrid"] = "hybrid"

    # BM25 parameters
    bm25_k1: float = 1.5  # Term frequency saturation
    bm25_b: float = 0.75  # Length normalization

    # Embedding parameters
    embedding_model: str = "all-MiniLM-L6-v2"  # Lightweight model

    # Hybrid parameters
    hybrid_alpha: float = 0.5  # BM25 weight (1-alpha = embedding weight)
    adaptive_alpha: bool = True  # Adjust alpha based on query type

    # Scoring thresholds
    # With hybrid/embedding: semantic scores are meaningful (0.3-0.5 for good matches)
    # With BM25 fallback: threshold is still reasonable for multi-term matches
    # Lower threshold = safer (keeps more items), higher = more aggressive compression
    relevance_threshold: float = 0.25  # Keep items above this score


@dataclass
class SmartCrusherConfig:
    """Configuration for smart statistical crusher (DEFAULT).

    Uses statistical analysis to intelligently compress tool outputs while
    PRESERVING THE ORIGINAL JSON SCHEMA. Output contains only items from
    the original array - no wrappers, no generated text, no metadata.

    Safe V1 Compression Recipe - Always keeps:
    - First K items (default 3)
    - Last K items (default 2)
    - Error items (containing 'error', 'exception', 'failed', 'critical')
    - Anomalous numeric items (> 2 std from mean)
    - Top-K by score if score field present
    - Items matching query context via RelevanceScorer

    GOTCHAS:
    - Adds ~5-10ms overhead per tool output for statistical analysis
    - Change point detection uses fixed window (5 items) - may miss:
      - Very gradual changes
      - Patterns in smaller arrays
    - TOP_N for search results assumes higher score = more relevant
      (may not be true for all APIs)

    SAFER SETTINGS:
    - Increase max_items_after_crush for critical data
    - Set variance_threshold lower (1.5) to catch more change points
    """

    enabled: bool = True  # Enabled by default (preferred over ToolCrusher)
    min_items_to_analyze: int = 5  # Don't analyze tiny arrays
    min_tokens_to_crush: int = 200  # Only crush if > N tokens
    variance_threshold: float = 2.0  # Std devs for change point detection
    uniqueness_threshold: float = 0.1  # Below this = nearly constant
    similarity_threshold: float = 0.8  # For clustering similar strings
    max_items_after_crush: int = 15  # Target max items in output
    preserve_change_points: bool = True
    factor_out_constants: bool = False  # Disabled - preserves original schema
    include_summaries: bool = False  # Disabled - no generated text

    # Feedback loop integration (TOIN - Tool Output Intelligence Network)
    use_feedback_hints: bool = True  # Use learned patterns to adjust compression

    # LOW FIX #21: Make TOIN confidence threshold configurable.
    # Minimum confidence required to apply TOIN recommendations.
    toin_confidence_threshold: float = 0.5

    # Relevance scoring configuration
    relevance: RelevanceScorerConfig = field(default_factory=RelevanceScorerConfig)


@dataclass
class CacheOptimizerConfig:
    """Configuration for provider-specific cache optimization.

    The CacheOptimizer system provides provider-specific caching strategies:
    - Anthropic: Explicit cache_control breakpoints for prompt caching
    - OpenAI: Prefix stabilization for automatic prefix caching
    - Google: CachedContent API lifecycle management

    This is COMPLEMENTARY to the CacheAligner transform - CacheAligner does
    basic prefix stabilization (date extraction, whitespace normalization),
    while CacheOptimizer applies provider-specific optimizations.

    Enable this for maximum cache hit rates when you know your provider.
    """

    enabled: bool = True  # Enable provider-specific cache optimization
    auto_detect_provider: bool = True  # Auto-detect from HeadroomClient provider
    min_cacheable_tokens: int = 1024  # Minimum tokens for caching (provider may override)
    enable_semantic_cache: bool = False  # Enable query-level semantic caching
    semantic_cache_similarity: float = 0.95  # Similarity threshold for semantic cache
    semantic_cache_max_entries: int = 1000  # Max semantic cache entries
    semantic_cache_ttl_seconds: int = 300  # Semantic cache TTL


@dataclass
class CCRConfig:
    """Configuration for Compress-Cache-Retrieve architecture.

    CCR makes compression REVERSIBLE: when SmartCrusher compresses tool outputs,
    the original data is cached. If the LLM needs more data, it can retrieve it.

    Key insight from research: REVERSIBLE compression beats irreversible compression.
    - Phil Schmid: "Prefer raw > Compaction > Summarization"
    - Factory.ai: "Cutting context too aggressively can backfire"

    How CCR works:
    1. COMPRESS: SmartCrusher compresses array from 1000 to 20 items
    2. CACHE: Original 1000 items stored in CompressionStore
    3. INJECT: Marker added to tell LLM how to retrieve more
    4. RETRIEVE: If LLM needs more, it calls headroom_retrieve(hash, query)

    Benefits:
    - Zero-risk compression: worst case = LLM retrieves what it needs
    - Feedback loop: track what gets retrieved to improve compression
    - Network effect: retrieval patterns improve compression for all users

    GOTCHAS:
    - Cache has TTL (default 5 min) - retrieval fails after expiration
    - Memory usage: ~1KB per cached entry
    - Only works with array compression (not string truncation)
    """

    enabled: bool = True  # Enable CCR (cache + retrieval markers)
    store_max_entries: int = 1000  # Max entries in compression store
    store_ttl_seconds: int = 300  # Cache TTL (5 minutes)
    inject_retrieval_marker: bool = True  # Add retrieval hint to compressed output
    feedback_enabled: bool = True  # Track retrieval events for learning
    min_items_to_cache: int = 20  # Only cache if original had >= N items

    # Tool injection (Phase 3)
    inject_tool: bool = True  # Inject headroom_retrieve tool into tools array
    inject_system_instructions: bool = False  # Add retrieval instructions to system message

    # Retrieval marker format.
    # Inserted at end of compressed content to tell LLM how to get more.
    marker_template: str = (
        "\n[{original_count} items compressed to {compressed_count}. Retrieve more: hash={hash}]"
    )


@dataclass
class HeadroomConfig:
    """Main configuration for HeadroomClient."""

    store_url: str = "sqlite:///headroom.db"
    default_mode: HeadroomMode = HeadroomMode.AUDIT
    model_context_limits: dict[str, int] = field(
        default_factory=lambda: DEFAULT_MODEL_CONTEXT_LIMITS.copy()
    )
    tool_crusher: ToolCrusherConfig = field(default_factory=ToolCrusherConfig)
    smart_crusher: SmartCrusherConfig = field(default_factory=SmartCrusherConfig)
    cache_aligner: CacheAlignerConfig = field(default_factory=CacheAlignerConfig)
    rolling_window: RollingWindowConfig = field(default_factory=RollingWindowConfig)
    cache_optimizer: CacheOptimizerConfig = field(default_factory=CacheOptimizerConfig)
    ccr: CCRConfig = field(default_factory=CCRConfig)  # Compress-Cache-Retrieve

    # Debugging - opt-in diff artifact generation
    generate_diff_artifact: bool = False  # Enable to get detailed transform diffs

    def get_context_limit(self, model: str) -> int | None:
        """
        Get context limit for a model from user overrides.

        Args:
            model: Model name.

        Returns:
            Context limit if configured, None otherwise.
            Provider should be consulted if None is returned.
        """
        if model in self.model_context_limits:
            return self.model_context_limits[model]
        # Try prefix matching for versioned model names
        for known_model, limit in self.model_context_limits.items():
            if model.startswith(known_model):
                return limit
        return None


@dataclass
class Block:
    """Atomic unit of context analysis."""

    kind: Literal["system", "user", "assistant", "tool_call", "tool_result", "rag", "unknown"]
    text: str
    tokens_est: int
    content_hash: str
    source_index: int  # Position in original messages
    flags: dict[str, Any] = field(default_factory=dict)


@dataclass
class WasteSignals:
    """Detected waste signals in a request."""

    json_bloat_tokens: int = 0  # JSON blocks > 500 tokens
    html_noise_tokens: int = 0  # HTML tags/comments
    base64_tokens: int = 0  # Base64 encoded blobs
    whitespace_tokens: int = 0  # Repeated whitespace
    dynamic_date_tokens: int = 0  # Dynamic dates in system prompt
    repetition_tokens: int = 0  # Repeated content

    def total(self) -> int:
        """Total waste tokens detected."""
        return (
            self.json_bloat_tokens
            + self.html_noise_tokens
            + self.base64_tokens
            + self.whitespace_tokens
            + self.dynamic_date_tokens
            + self.repetition_tokens
        )

    def to_dict(self) -> dict[str, int]:
        """Convert to dictionary for storage."""
        return {
            "json_bloat": self.json_bloat_tokens,
            "html_noise": self.html_noise_tokens,
            "base64": self.base64_tokens,
            "whitespace": self.whitespace_tokens,
            "dynamic_date": self.dynamic_date_tokens,
            "repetition": self.repetition_tokens,
        }


@dataclass
class CachePrefixMetrics:
    """Detailed cache prefix metrics for debugging cache misses.

    Log these per-request to understand why caching is or isn't working.
    Compare stable_prefix_hash across requests - any change means cache miss.
    """

    stable_prefix_bytes: int  # Byte length of static prefix
    stable_prefix_tokens_est: int  # Estimated token count of static prefix
    stable_prefix_hash: str  # Hash of canonicalized prefix (16 chars)
    prefix_changed: bool  # True if hash differs from previous request in session
    previous_hash: str | None = None  # Previous hash for comparison (None = first request)


@dataclass
class TransformResult:
    """Output of a transform operation."""

    messages: list[dict[str, Any]]
    tokens_before: int
    tokens_after: int
    transforms_applied: list[str]
    markers_inserted: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    diff_artifact: DiffArtifact | None = None  # Populated if generate_diff_artifact=True
    cache_metrics: CachePrefixMetrics | None = None  # Populated by CacheAligner


@dataclass
class TransformDiff:
    """Diff info for a single transform (for debugging)."""

    transform_name: str
    tokens_before: int
    tokens_after: int
    tokens_saved: int
    items_removed: int = 0
    items_kept: int = 0
    details: str = ""  # Human-readable description of what changed


@dataclass
class DiffArtifact:
    """Complete diff artifact for debugging transform pipeline.

    Opt-in via HeadroomConfig.generate_diff_artifact = True.
    Useful for understanding what each transform did to your messages.
    """

    request_id: str
    original_tokens: int
    optimized_tokens: int
    total_tokens_saved: int
    transforms: list[TransformDiff] = field(default_factory=list)


@dataclass
class SimulationResult:
    """Result of a simulation (dry-run)."""

    tokens_before: int
    tokens_after: int
    tokens_saved: int
    transforms: list[str]
    estimated_savings: str  # Human-readable cost estimate
    messages_optimized: list[dict[str, Any]]
    block_breakdown: dict[str, int]
    waste_signals: dict[str, int]
    stable_prefix_hash: str
    cache_alignment_score: float


@dataclass
class RequestMetrics:
    """Comprehensive metrics for a single request."""

    request_id: str
    timestamp: datetime
    model: str
    stream: bool
    mode: str  # audit | optimize | simulate

    # Token breakdown
    tokens_input_before: int
    tokens_input_after: int
    tokens_output: int | None = None  # None if streaming

    # Block breakdown
    block_breakdown: dict[str, int] = field(default_factory=dict)

    # Waste signals
    waste_signals: dict[str, int] = field(default_factory=dict)

    # Cache metrics (basic)
    stable_prefix_hash: str = ""
    cache_alignment_score: float = 0.0
    cached_tokens: int | None = None  # From API response if available

    # Cache optimizer metrics (provider-specific)
    cache_optimizer_used: str | None = None  # e.g., "anthropic-cache-optimizer"
    cache_optimizer_strategy: str | None = None  # e.g., "explicit_breakpoints"
    cacheable_tokens: int = 0  # Tokens eligible for caching
    breakpoints_inserted: int = 0  # Cache breakpoints added (Anthropic)
    estimated_cache_hit: bool = False  # Whether prefix matches previous
    estimated_savings_percent: float = 0.0  # Estimated savings if cached
    semantic_cache_hit: bool = False  # Whether semantic cache was hit

    # Transform details
    transforms_applied: list[str] = field(default_factory=list)
    tool_units_dropped: int = 0
    turns_dropped: int = 0

    # For debugging
    messages_hash: str = ""
    error: str | None = None
headroom/exceptions.py
ADDED
@@ -0,0 +1,192 @@
"""Custom exceptions for Headroom.

This module provides explicit exception classes for better error handling
and debugging. All exceptions inherit from HeadroomError, making it easy
to catch all Headroom-related errors.

Example:
    from headroom import HeadroomClient, HeadroomError, ConfigurationError

    try:
        client = HeadroomClient(...)
        client.validate_setup()
    except ConfigurationError as e:
        print(f"Configuration problem: {e}")
    except HeadroomError as e:
        print(f"Headroom error: {e}")
"""

from __future__ import annotations

from typing import Any


class HeadroomError(Exception):
    """Base exception for all Headroom errors.

    All Headroom exceptions inherit from this class, making it easy
    to catch any Headroom-related error:

        try:
            client.chat.completions.create(...)
        except HeadroomError as e:
            # Handle any Headroom error
            pass
    """

    def __init__(self, message: str, details: dict[str, Any] | None = None):
        super().__init__(message)
        self.message = message
        self.details = details or {}

    def __str__(self) -> str:
        if self.details:
            detail_str = ", ".join(f"{k}={v}" for k, v in self.details.items())
            return f"{self.message} ({detail_str})"
        return self.message


class ConfigurationError(HeadroomError):
    """Raised when Headroom is misconfigured.

    This includes:
    - Invalid mode values
    - Missing required configuration
    - Incompatible configuration combinations

    Example:
        ConfigurationError(
            "Invalid mode 'foo'",
            details={"valid_modes": ["audit", "optimize"]}
        )
    """

    pass


class ProviderError(HeadroomError):
    """Raised when there's an issue with the LLM provider.

    This includes:
    - Provider not recognized
    - Provider-specific configuration issues
    - Token counter errors

    Example:
        ProviderError(
            "Unknown provider",
            details={"provider": "foo", "known_providers": ["openai", "anthropic"]}
        )
    """

    pass


class StorageError(HeadroomError):
    """Raised when there's an issue with metrics storage.

    This includes:
    - Database connection failures
    - Invalid storage URL
    - Write failures

    Example:
        StorageError(
            "Cannot connect to database",
            details={"url": "sqlite:///foo.db", "error": "Permission denied"}
        )
    """

    pass


class CompressionError(HeadroomError):
    """Raised when compression fails.

    This includes:
    - Parse errors in tool outputs
    - Invalid JSON structures
    - Compression strategy failures

    Example:
        CompressionError(
            "Failed to parse tool output",
            details={"tool_name": "search_api", "content_preview": "..."}
        )
    """

    pass


class TokenizationError(HeadroomError):
    """Raised when token counting fails.

    This includes:
    - Unknown model for tokenization
    - Encoding errors
    - Tiktoken/tokenizer loading failures

    Example:
        TokenizationError(
            "Unknown model for tokenization",
            details={"model": "gpt-99", "fallback_used": True}
        )
    """

    pass


class CacheError(HeadroomError):
    """Raised when caching operations fail.

    This includes:
    - Cache store errors
    - Retrieval failures
    - CCR (Compress-Cache-Retrieve) errors

    Example:
        CacheError(
            "Cache entry expired",
            details={"hash": "abc123", "ttl": 300}
        )
    """

    pass


class ValidationError(HeadroomError):
    """Raised when setup validation fails.

    This is raised by validate_setup() when the configuration
    or environment is not properly set up.

    Example:
        ValidationError(
            "Setup validation failed",
            details={
                "provider_ok": True,
                "storage_ok": False,
                "storage_error": "Cannot write to database"
            }
        )
    """

    pass


class TransformError(HeadroomError):
    """Raised when a transform fails to apply.

    This includes:
    - SmartCrusher failures
    - RollingWindow errors
    - Pipeline errors

    Example:
        TransformError(
            "Transform failed",
            details={"transform": "smart_crusher", "reason": "..."}
        )
    """

    pass