headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/client.py ADDED
@@ -0,0 +1,977 @@
+"""Main HeadroomClient implementation for Headroom SDK."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from datetime import datetime
+from typing import Any
+
+from .cache import (
+    BaseCacheOptimizer,
+    CacheConfig,
+    CacheOptimizerRegistry,
+    OptimizationContext,
+    SemanticCacheLayer,
+)
+from .config import (
+    HeadroomConfig,
+    HeadroomMode,
+    RequestMetrics,
+    SimulationResult,
+)
+from .parser import parse_messages
+from .providers.base import Provider
+from .storage import create_storage
+from .tokenizer import Tokenizer
+from .transforms import CacheAligner, TransformPipeline
+from .utils import (
+    compute_messages_hash,
+    compute_prefix_hash,
+    estimate_cost,
+    format_cost,
+    generate_request_id,
+)
+
+
+class ChatCompletions:
+    """Wrapper for chat.completions API (OpenAI-style)."""
+
+    def __init__(self, client: HeadroomClient):
+        self._client = client
+
+    def create(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        stream: bool = False,
+        # Headroom-specific parameters
+        headroom_mode: str | None = None,
+        headroom_cache_prefix_tokens: int | None = None,
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_keep_turns: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        # Pass through all other kwargs
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Create a chat completion with optional Headroom optimization.
+
+        Args:
+            model: Model name.
+            messages: List of messages.
+            stream: Whether to stream the response.
+            headroom_mode: Override default mode ("audit" | "optimize").
+            headroom_cache_prefix_tokens: Target cache-aligned prefix size.
+            headroom_output_buffer_tokens: Reserve tokens for output.
+            headroom_keep_turns: Never drop last N turns.
+            headroom_tool_profiles: Per-tool compression config.
+            **kwargs: Additional arguments passed to underlying client.
+
+        Returns:
+            Chat completion response (or stream iterator).
+        """
+        return self._client._create(
+            model=model,
+            messages=messages,
+            stream=stream,
+            headroom_mode=headroom_mode,
+            headroom_cache_prefix_tokens=headroom_cache_prefix_tokens,
+            headroom_output_buffer_tokens=headroom_output_buffer_tokens,
+            headroom_keep_turns=headroom_keep_turns,
+            headroom_tool_profiles=headroom_tool_profiles,
+            api_style="openai",
+            **kwargs,
+        )
+
+    def simulate(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        headroom_mode: str = "optimize",
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        **kwargs: Any,
+    ) -> SimulationResult:
+        """
+        Simulate optimization without calling the API.
+
+        Args:
+            model: Model name.
+            messages: List of messages.
+            headroom_mode: Mode to simulate.
+            headroom_output_buffer_tokens: Output buffer to use.
+            headroom_tool_profiles: Tool profiles to use.
+            **kwargs: Additional arguments (ignored).
+
+        Returns:
+            SimulationResult with projected changes.
+        """
+        return self._client._simulate(
+            model=model,
+            messages=messages,
+            headroom_mode=headroom_mode,
+            headroom_output_buffer_tokens=headroom_output_buffer_tokens,
+            headroom_tool_profiles=headroom_tool_profiles,
+        )
+
+
+class Messages:
+    """Wrapper for messages API (Anthropic-style)."""
+
+    def __init__(self, client: HeadroomClient):
+        self._client = client
+
+    def create(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        max_tokens: int = 1024,
+        # Headroom-specific parameters
+        headroom_mode: str | None = None,
+        headroom_cache_prefix_tokens: int | None = None,
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_keep_turns: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        # Pass through all other kwargs
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Create a message with optional Headroom optimization.
+
+        Args:
+            model: Model name.
+            messages: List of messages.
+            max_tokens: Maximum tokens in response.
+            headroom_mode: Override default mode ("audit" | "optimize").
+            headroom_cache_prefix_tokens: Target cache-aligned prefix size.
+            headroom_output_buffer_tokens: Reserve tokens for output.
+            headroom_keep_turns: Never drop last N turns.
+            headroom_tool_profiles: Per-tool compression config.
+            **kwargs: Additional arguments passed to underlying client.
+
+        Returns:
+            Message response.
+        """
+        return self._client._create(
+            model=model,
+            messages=messages,
+            stream=False,
+            headroom_mode=headroom_mode,
+            headroom_cache_prefix_tokens=headroom_cache_prefix_tokens,
+            headroom_output_buffer_tokens=headroom_output_buffer_tokens,
+            headroom_keep_turns=headroom_keep_turns,
+            headroom_tool_profiles=headroom_tool_profiles,
+            api_style="anthropic",
+            max_tokens=max_tokens,
+            **kwargs,
+        )
+
+    def stream(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        max_tokens: int = 1024,
+        # Headroom-specific parameters
+        headroom_mode: str | None = None,
+        headroom_cache_prefix_tokens: int | None = None,
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_keep_turns: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        # Pass through all other kwargs
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Stream a message with optional Headroom optimization.
+
+        Args:
+            model: Model name.
+            messages: List of messages.
+            max_tokens: Maximum tokens in response.
+            headroom_mode: Override default mode ("audit" | "optimize").
+            headroom_cache_prefix_tokens: Target cache-aligned prefix size.
+            headroom_output_buffer_tokens: Reserve tokens for output.
+            headroom_keep_turns: Never drop last N turns.
+            headroom_tool_profiles: Per-tool compression config.
+            **kwargs: Additional arguments passed to underlying client.
+
+        Returns:
+            Stream context manager.
+        """
+        return self._client._create(
+            model=model,
+            messages=messages,
+            stream=True,
+            headroom_mode=headroom_mode,
+            headroom_cache_prefix_tokens=headroom_cache_prefix_tokens,
+            headroom_output_buffer_tokens=headroom_output_buffer_tokens,
+            headroom_keep_turns=headroom_keep_turns,
+            headroom_tool_profiles=headroom_tool_profiles,
+            api_style="anthropic",
+            max_tokens=max_tokens,
+            **kwargs,
+        )
+
+    def simulate(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        headroom_mode: str = "optimize",
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        **kwargs: Any,
+    ) -> SimulationResult:
+        """
+        Simulate optimization without calling the API.
+
+        Args:
+            model: Model name.
+            messages: List of messages.
+            headroom_mode: Mode to simulate.
+            headroom_output_buffer_tokens: Output buffer to use.
+            headroom_tool_profiles: Tool profiles to use.
+            **kwargs: Additional arguments (ignored).
+
+        Returns:
+            SimulationResult with projected changes.
+        """
+        return self._client._simulate(
+            model=model,
+            messages=messages,
+            headroom_mode=headroom_mode,
+            headroom_output_buffer_tokens=headroom_output_buffer_tokens,
+            headroom_tool_profiles=headroom_tool_profiles,
+        )
+
+
+class HeadroomClient:
+    """
+    Context Budget Controller wrapper for LLM API clients.
+
+    Provides automatic context optimization, waste detection, and
+    cache alignment while maintaining API compatibility.
+    """
+
+    def __init__(
+        self,
+        original_client: Any,
+        provider: Provider,
+        store_url: str | None = None,
+        default_mode: str = "audit",
+        model_context_limits: dict[str, int] | None = None,
+        cache_optimizer: BaseCacheOptimizer | None = None,
+        enable_cache_optimizer: bool = True,
+        enable_semantic_cache: bool = False,
+    ):
+        """
+        Initialize HeadroomClient.
+
+        Args:
+            original_client: The underlying LLM client (OpenAI-compatible).
+            provider: Provider instance for model-specific behavior.
+            store_url: Storage URL (sqlite:// or jsonl://). Defaults to temp dir.
+            default_mode: Default mode ("audit" | "optimize").
+            model_context_limits: Override context limits for models.
+            cache_optimizer: Optional custom cache optimizer. If None and
+                enable_cache_optimizer=True, auto-detects from provider.
+            enable_cache_optimizer: Enable provider-specific cache optimization.
+            enable_semantic_cache: Enable query-level semantic caching.
+        """
+        self._original = original_client
+        self._provider = provider
+
+        # Set default store_url to temp directory for better DevEx
+        if store_url is None:
+            import os
+            import tempfile
+
+            db_path = os.path.join(tempfile.gettempdir(), "headroom.db")
+            store_url = f"sqlite:///{db_path}"
+
+        self._store_url = store_url
+        self._default_mode = HeadroomMode(default_mode)
+
+        # Build config
+        self._config = HeadroomConfig()
+        self._config.store_url = store_url
+        self._config.default_mode = self._default_mode
+        self._config.cache_optimizer.enabled = enable_cache_optimizer
+        self._config.cache_optimizer.enable_semantic_cache = enable_semantic_cache
+
+        if model_context_limits:
+            self._config.model_context_limits.update(model_context_limits)
+
+        # Initialize storage
+        self._storage = create_storage(store_url)
+
+        # Initialize transform pipeline
+        self._pipeline = TransformPipeline(self._config, provider=self._provider)
+
+        # Initialize cache optimizer
+        self._cache_optimizer: BaseCacheOptimizer | None = None
+        self._semantic_cache_layer: SemanticCacheLayer | None = None
+
+        if enable_cache_optimizer:
+            if cache_optimizer is not None:
+                self._cache_optimizer = cache_optimizer
+            else:
+                # Auto-detect from provider
+                provider_name = self._provider.name.lower()
+                if CacheOptimizerRegistry.is_registered(provider_name):
+                    cache_config = CacheConfig(
+                        min_cacheable_tokens=self._config.cache_optimizer.min_cacheable_tokens,
+                    )
+                    self._cache_optimizer = CacheOptimizerRegistry.get(
+                        provider_name,
+                        config=cache_config,
+                    )
+
+        # Wrap with semantic cache if enabled
+        if enable_semantic_cache and self._cache_optimizer is not None:
+            self._semantic_cache_layer = SemanticCacheLayer(
+                self._cache_optimizer,
+                similarity_threshold=self._config.cache_optimizer.semantic_cache_similarity,
+                max_entries=self._config.cache_optimizer.semantic_cache_max_entries,
+                ttl_seconds=self._config.cache_optimizer.semantic_cache_ttl_seconds,
+            )
+
+        # Public API - OpenAI style
+        self.chat = type("Chat", (), {"completions": ChatCompletions(self)})()
+        # Public API - Anthropic style
+        self.messages = Messages(self)
+
+    def _get_tokenizer(self, model: str) -> Tokenizer:
+        """Get tokenizer for model using provider."""
+        token_counter = self._provider.get_token_counter(model)
+        return Tokenizer(token_counter, model)
+
+    def _get_context_limit(self, model: str) -> int:
+        """Get context limit from user config or provider."""
+        # User override takes precedence
+        limit = self._config.get_context_limit(model)
+        if limit is not None:
+            return limit
+        # Fall back to provider
+        return self._provider.get_context_limit(model)
+
+    def _create(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        stream: bool = False,
+        headroom_mode: str | None = None,
+        headroom_cache_prefix_tokens: int | None = None,
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_keep_turns: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+        api_style: str = "openai",
+        **kwargs: Any,
+    ) -> Any:
+        """Internal implementation of create."""
+        request_id = generate_request_id()
+        timestamp = datetime.utcnow()
+        mode = HeadroomMode(headroom_mode) if headroom_mode else self._default_mode
+
+        tokenizer = self._get_tokenizer(model)
+
+        # Analyze original messages
+        blocks, block_breakdown, waste_signals = parse_messages(messages, tokenizer)
+        tokens_before = tokenizer.count_messages(messages)
+
+        # Compute cache alignment score
+        aligner = CacheAligner(self._config.cache_aligner)
+        cache_alignment_score = aligner.get_alignment_score(messages)
+
+        # Compute stable prefix hash
+        stable_prefix_hash = compute_prefix_hash(messages)
+
+        # Cache optimizer metrics (populated later if optimizer is used)
+        cache_optimizer_used = None
+        cache_optimizer_strategy = None
+        cacheable_tokens = 0
+        breakpoints_inserted = 0
+        estimated_cache_hit = False
+        estimated_savings_percent = 0.0
+        semantic_cache_hit = False
+        cached_response = None
+
+        # Apply transforms if in optimize mode
+        if mode == HeadroomMode.OPTIMIZE:
+            output_buffer = (
+                headroom_output_buffer_tokens or self._config.rolling_window.output_buffer_tokens
+            )
+            model_limit = self._get_context_limit(model)
+
+            result = self._pipeline.apply(
+                messages,
+                model,
+                model_limit=model_limit,
+                output_buffer=output_buffer,
+                tool_profiles=headroom_tool_profiles or {},
+            )
+
+            optimized_messages = result.messages
+            tokens_after = result.tokens_after
+            transforms_applied = result.transforms_applied
+
+            # Apply provider-specific cache optimization
+            if self._cache_optimizer is not None or self._semantic_cache_layer is not None:
+                cache_context = OptimizationContext(
+                    provider=self._provider.name.lower(),
+                    model=model,
+                    query=self._extract_query(optimized_messages),
+                )
+
+                # Check semantic cache first (if enabled)
+                if self._semantic_cache_layer is not None:
+                    cache_result = self._semantic_cache_layer.process(
+                        optimized_messages, cache_context
+                    )
+                    semantic_cache_hit = cache_result.semantic_cache_hit
+                    if semantic_cache_hit:
+                        cached_response = cache_result.cached_response
+
+                    # Update metrics from cache result
+                    cache_optimizer_used = getattr(
+                        cache_result.metrics, "optimizer_name", None
+                    ) or (self._cache_optimizer.name if self._cache_optimizer else "")
+                    cache_optimizer_strategy = getattr(cache_result.metrics, "strategy", "")
+                    cacheable_tokens = cache_result.metrics.cacheable_tokens
+                    breakpoints_inserted = cache_result.metrics.breakpoints_inserted
+                    estimated_cache_hit = cache_result.metrics.estimated_cache_hit
+                    estimated_savings_percent = cache_result.metrics.estimated_savings_percent
+
+                    # Apply optimized messages (with cache_control blocks for Anthropic)
+                    if cache_result.messages:
+                        optimized_messages = cache_result.messages
+
+                elif self._cache_optimizer is not None:
+                    # Direct cache optimizer (no semantic layer)
+                    cache_result = self._cache_optimizer.optimize(optimized_messages, cache_context)
+                    cache_optimizer_used = self._cache_optimizer.name
+                    cache_optimizer_strategy = self._cache_optimizer.strategy.value
+                    cacheable_tokens = cache_result.metrics.cacheable_tokens
+                    breakpoints_inserted = cache_result.metrics.breakpoints_inserted
+                    estimated_cache_hit = cache_result.metrics.estimated_cache_hit
+                    estimated_savings_percent = cache_result.metrics.estimated_savings_percent
+
+                    if cache_result.messages:
+                        optimized_messages = cache_result.messages
+
+                    transforms_applied.extend(
+                        f"cache_optimizer:{t}" for t in (cache_result.transforms_applied or [])
+                    )
+
+            # Recalculate prefix hash after optimization
+            stable_prefix_hash = compute_prefix_hash(optimized_messages)
+        else:
+            # Audit mode - no changes
+            optimized_messages = messages
+            tokens_after = tokens_before
+            transforms_applied = []
+
+        # Create metrics
+        metrics = RequestMetrics(
+            request_id=request_id,
+            timestamp=timestamp,
+            model=model,
+            stream=stream,
+            mode=mode.value,
+            tokens_input_before=tokens_before,
+            tokens_input_after=tokens_after,
+            block_breakdown=block_breakdown,
+            waste_signals=waste_signals.to_dict(),
+            stable_prefix_hash=stable_prefix_hash,
+            cache_alignment_score=cache_alignment_score,
+            transforms_applied=transforms_applied,
+            messages_hash=compute_messages_hash(messages),
+            # Cache optimizer metrics
+            cache_optimizer_used=cache_optimizer_used,
+            cache_optimizer_strategy=cache_optimizer_strategy,
+            cacheable_tokens=cacheable_tokens,
+            breakpoints_inserted=breakpoints_inserted,
+            estimated_cache_hit=estimated_cache_hit,
+            estimated_savings_percent=estimated_savings_percent,
+            semantic_cache_hit=semantic_cache_hit,
+        )
+
+        # Update session stats
+        self._update_session_stats(
+            mode=mode,
+            tokens_before=tokens_before,
+            tokens_after=tokens_after,
+            cache_hit=semantic_cache_hit,
+        )
+
+        # Return cached response if semantic cache hit
+        if semantic_cache_hit and cached_response is not None:
+            self._storage.save(metrics)
+            return cached_response
+
+        # Call underlying client based on API style
+        try:
+            if api_style == "anthropic":
+                return self._call_anthropic(
+                    model=model,
+                    messages=optimized_messages,
+                    stream=stream,
+                    metrics=metrics,
+                    **kwargs,
+                )
+            else:
+                return self._call_openai(
+                    model=model,
+                    messages=optimized_messages,
+                    stream=stream,
+                    metrics=metrics,
+                    **kwargs,
+                )
+
+        except Exception as e:
+            metrics.error = str(e)
+            self._storage.save(metrics)
+            raise
+
+    def _call_openai(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        stream: bool,
+        metrics: RequestMetrics,
+        **kwargs: Any,
+    ) -> Any:
+        """Call OpenAI-style API."""
+        if stream:
+            response = self._original.chat.completions.create(
+                model=model,
+                messages=messages,
+                stream=True,
+                **kwargs,
+            )
+            return self._wrap_stream(response, metrics)
+        else:
+            response = self._original.chat.completions.create(
+                model=model,
+                messages=messages,
+                stream=False,
+                **kwargs,
+            )
+
+            # Extract output tokens from response
+            if hasattr(response, "usage") and response.usage:
+                metrics.tokens_output = response.usage.completion_tokens
+                # Check for cached tokens in usage
+                if hasattr(response.usage, "prompt_tokens_details"):
+                    details = response.usage.prompt_tokens_details
+                    if hasattr(details, "cached_tokens"):
+                        metrics.cached_tokens = details.cached_tokens
+
+            self._storage.save(metrics)
+            return response
+
+    def _call_anthropic(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        stream: bool,
+        metrics: RequestMetrics,
+        **kwargs: Any,
+    ) -> Any:
+        """Call Anthropic-style API."""
+        if stream:
+            # Anthropic streaming returns a context manager
+            stream_manager = self._original.messages.stream(
+                model=model,
+                messages=messages,
+                **kwargs,
+            )
+            # Save metrics when stream is created
+            self._storage.save(metrics)
+            return stream_manager
+        else:
+            response = self._original.messages.create(
+                model=model,
+                messages=messages,
+                **kwargs,
+            )
+
+            # Extract output tokens from Anthropic response
+            if hasattr(response, "usage") and response.usage:
+                metrics.tokens_output = response.usage.output_tokens
+                # Check for cached tokens in Anthropic usage
+                if hasattr(response.usage, "cache_read_input_tokens"):
+                    metrics.cached_tokens = response.usage.cache_read_input_tokens
+
+            self._storage.save(metrics)
+            return response
+
+    def _wrap_stream(
+        self,
+        stream: Iterator[Any],
+        metrics: RequestMetrics,
+    ) -> Iterator[Any]:
+        """Wrap stream to pass through chunks and save metrics at end."""
+        try:
+            yield from stream
+        finally:
+            # Save metrics when stream completes
+            # Note: output tokens unknown for streams
+            self._storage.save(metrics)
+
+    def _extract_query(self, messages: list[dict[str, Any]]) -> str:
+        """Extract query from messages for semantic caching.
+
+        Returns the last user message content as the query.
+        """
+        for msg in reversed(messages):
+            if msg.get("role") == "user":
+                content = msg.get("content", "")
+                if isinstance(content, str):
+                    return content
+                elif isinstance(content, list):
+                    # Content block format
+                    for block in content:
+                        if isinstance(block, dict) and block.get("type") == "text":
+                            text_val = block.get("text", "")
+                            return str(text_val) if text_val else ""
+                    return ""
+        return ""
+
+    def _store_response_in_semantic_cache(
+        self,
+        messages: list[dict[str, Any]],
+        response: Any,
+        model: str,
+    ) -> None:
+        """Store response in semantic cache for future hits."""
+        if self._semantic_cache_layer is not None:
+            cache_context = OptimizationContext(
+                provider=self._provider.name.lower(),
+                model=model,
+                query=self._extract_query(messages),
+            )
+            # Extract response content for caching
+            response_data = self._extract_response_content(response)
+            if response_data:
+                self._semantic_cache_layer.store_response(messages, response_data, cache_context)
+
+    def _extract_response_content(self, response: Any) -> dict[str, Any] | None:
+        """Extract cacheable content from API response."""
+        try:
+            # OpenAI format
+            if hasattr(response, "choices") and response.choices:
+                choice = response.choices[0]
+                if hasattr(choice, "message"):
+                    return {
+                        "role": "assistant",
+                        "content": choice.message.content,
+                    }
+            # Anthropic format
+            elif hasattr(response, "content"):
+                return {
+                    "role": "assistant",
+                    "content": response.content,
+                }
+        except Exception:
+            pass
+        return None
+
+    def _simulate(
+        self,
+        *,
+        model: str,
+        messages: list[dict[str, Any]],
+        headroom_mode: str = "optimize",
+        headroom_output_buffer_tokens: int | None = None,
+        headroom_tool_profiles: dict[str, dict[str, Any]] | None = None,
+    ) -> SimulationResult:
+        """Internal implementation of simulate."""
+        tokenizer = self._get_tokenizer(model)
+
+        # Analyze original
+        blocks, block_breakdown, waste_signals = parse_messages(messages, tokenizer)
+        tokens_before = tokenizer.count_messages(messages)
+
+        # Compute original cache alignment
+        aligner = CacheAligner(self._config.cache_aligner)
+        cache_alignment_score = aligner.get_alignment_score(messages)
+        compute_prefix_hash(messages)
+
+        # Apply transforms
+        output_buffer = (
+            headroom_output_buffer_tokens or self._config.rolling_window.output_buffer_tokens
+        )
+        model_limit = self._get_context_limit(model)
+
+        result = self._pipeline.simulate(
+            messages,
+            model,
+            model_limit=model_limit,
+            output_buffer=output_buffer,
+            tool_profiles=headroom_tool_profiles or {},
+        )
+
+        tokens_saved = tokens_before - result.tokens_after
+
+        # Estimate cost savings using provider
+        cost_before = estimate_cost(tokens_before, 500, model, provider=self._provider)
+        cost_after = estimate_cost(result.tokens_after, 500, model, provider=self._provider)
+
+        if cost_before is not None and cost_after is not None:
+            savings = format_cost(cost_before - cost_after)
+        else:
+            savings = "N/A"
+
+        # Recalculate prefix hash after optimization
+        optimized_prefix_hash = compute_prefix_hash(result.messages)
+
+        return SimulationResult(
+            tokens_before=tokens_before,
+            tokens_after=result.tokens_after,
+            tokens_saved=tokens_saved,
+            transforms=result.transforms_applied,
+            estimated_savings=f"{savings} per request",
+            messages_optimized=result.messages,
+            block_breakdown=block_breakdown,
+            waste_signals=waste_signals.to_dict(),
+            stable_prefix_hash=optimized_prefix_hash,
+            cache_alignment_score=cache_alignment_score,
+        )
+
+    def get_metrics(
+        self,
+        start_time: datetime | None = None,
+        end_time: datetime | None = None,
+        model: str | None = None,
+        mode: str | None = None,
+        limit: int = 100,
+    ) -> list[RequestMetrics]:
+        """
+        Query stored metrics.
+
+        Args:
+            start_time: Filter by timestamp >= start_time.
+            end_time: Filter by timestamp <= end_time.
+            model: Filter by model name.
+            mode: Filter by mode.
+            limit: Maximum results.
+
+        Returns:
+            List of RequestMetrics.
+        """
+        return self._storage.query(
+            start_time=start_time,
+            end_time=end_time,
+            model=model,
+            mode=mode,
+            limit=limit,
+        )
+
+    def get_summary(
+        self,
+        start_time: datetime | None = None,
+        end_time: datetime | None = None,
+    ) -> dict[str, Any]:
+        """
+        Get summary statistics.
+
+        Args:
+            start_time: Filter by timestamp >= start_time.
+            end_time: Filter by timestamp <= end_time.
+
+        Returns:
+            Summary statistics dict.
+        """
+        return self._storage.get_summary_stats(start_time, end_time)
+
+    def close(self) -> None:
+        """Close storage connection."""
+        self._storage.close()
+
+    def __enter__(self) -> HeadroomClient:
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Context manager exit."""
+        self.close()
+
+    def validate_setup(self) -> dict[str, Any]:
+        """Validate that Headroom is properly configured.
+
+        This method checks:
+        - Provider is valid and can count tokens
+        - Storage is accessible and writable
+        - Configuration is valid
+        - Cache optimizer (if enabled) is working
+
+        Returns:
+            dict with validation results:
+                {
+                    "valid": True/False,
+                    "provider": {"ok": bool, "name": str, "error": str | None},
+                    "storage": {"ok": bool, "url": str, "error": str | None},
+                    "config": {"ok": bool, "mode": str, "error": str | None},
+                    "cache_optimizer": {"ok": bool, "name": str | None, "error": str | None},
+                }
+
+        Raises:
+            ValidationError: If validation fails and raise_on_error=True.
+
+        Example:
+            client = HeadroomClient(...)
+            result = client.validate_setup()
+            if not result["valid"]:
+                print("Setup issues:", result)
+        """
+        result: dict[str, Any] = {
+            "valid": True,
+            "provider": {"ok": False, "name": None, "error": None},
+            "storage": {"ok": False, "url": self._store_url, "error": None},
+            "config": {"ok": False, "mode": self._default_mode.value, "error": None},
+            "cache_optimizer": {"ok": True, "name": None, "error": None},
+        }
+
+        # Validate provider
+        try:
+            result["provider"]["name"] = self._provider.name
+            # Test token counting
+            test_messages = [{"role": "user", "content": "test"}]
+            tokenizer = self._get_tokenizer("gpt-4")
+            count = tokenizer.count_messages(test_messages)
+            if count > 0:
+                result["provider"]["ok"] = True
+            else:
+                result["provider"]["error"] = "Token count returned 0"
+                result["valid"] = False
+        except Exception as e:
+            result["provider"]["error"] = str(e)
+            result["valid"] = False
+
+        # Validate storage
+        try:
+            # Try to get summary (tests read)
+            self._storage.get_summary_stats()
+            result["storage"]["ok"] = True
+        except Exception as e:
+            result["storage"]["error"] = str(e)
+            result["valid"] = False
+
+        # Validate config
+        try:
+            # Check mode is valid
+            if self._default_mode in (HeadroomMode.AUDIT, HeadroomMode.OPTIMIZE):
+                result["config"]["ok"] = True
+            else:
+                result["config"]["error"] = f"Invalid mode: {self._default_mode}"
+                result["valid"] = False
+        except Exception as e:
+            result["config"]["error"] = str(e)
+            result["valid"] = False
+
+        # Validate cache optimizer (if enabled)
+        if self._cache_optimizer is not None:
+            try:
+                result["cache_optimizer"]["name"] = self._cache_optimizer.name
+                result["cache_optimizer"]["ok"] = True
+            except Exception as e:
+                result["cache_optimizer"]["error"] = str(e)
+                # Don't fail validation for cache optimizer issues
+        elif self._config.cache_optimizer.enabled:
+            result["cache_optimizer"]["error"] = "Enabled but no optimizer loaded"
+            # Don't fail validation, just warn
+
+        return result
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get quick statistics without database query.
+
+        This returns in-memory stats tracked during this session.
+        For historical metrics, use get_metrics() or get_summary().
+
+        Returns:
+            dict with session statistics:
+                {
+                    "session": {
+                        "requests_total": int,
+                        "requests_optimized": int,
+                        "requests_audit": int,
+                        "tokens_saved_total": int,
+                        "cache_hits": int,
+                    },
+                    "config": {
+                        "mode": str,
+                        "provider": str,
+                        "cache_optimizer": str | None,
+                        "semantic_cache": bool,
+                    },
+                    "transforms": {
+                        "smart_crusher_enabled": bool,
+                        "rolling_window_enabled": bool,
+                        "cache_aligner_enabled": bool,
+                    },
+                }
+
+        Example:
+            stats = client.get_stats()
+            print(f"Saved {stats['session']['tokens_saved_total']} tokens this session")
+        """
+        # Initialize session stats if not present
+        if not hasattr(self, "_session_stats"):
+            self._session_stats = {
+                "requests_total": 0,
+                "requests_optimized": 0,
+                "requests_audit": 0,
+                "tokens_saved_total": 0,
+                "cache_hits": 0,
+            }
+
+        return {
+            "session": dict(self._session_stats),
+            "config": {
+                "mode": self._default_mode.value,
+                "provider": self._provider.name,
+                "cache_optimizer": (self._cache_optimizer.name if self._cache_optimizer else None),
+                "semantic_cache": self._semantic_cache_layer is not None,
+            },
+            "transforms": {
+                "smart_crusher_enabled": self._config.smart_crusher.enabled,
+                "rolling_window_enabled": self._config.rolling_window.enabled,
+                "cache_aligner_enabled": self._config.cache_aligner.enabled,
+            },
+        }
+
+    def _update_session_stats(
+        self,
+        mode: HeadroomMode,
+        tokens_before: int,
+        tokens_after: int,
+        cache_hit: bool = False,
+    ) -> None:
+        """Update in-memory session statistics."""
+        if not hasattr(self, "_session_stats"):
+            self._session_stats = {
+                "requests_total": 0,
+                "requests_optimized": 0,
+                "requests_audit": 0,
+                "tokens_saved_total": 0,
+                "cache_hits": 0,
+            }
+
+        self._session_stats["requests_total"] += 1
+
+        if mode == HeadroomMode.OPTIMIZE:
+            self._session_stats["requests_optimized"] += 1
+            self._session_stats["tokens_saved_total"] += max(0, tokens_before - tokens_after)
+        else:
+            self._session_stats["requests_audit"] += 1
+
+        if cache_hit:
+            self._session_stats["cache_hits"] += 1
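
For orientation, a minimal usage sketch of the wrapped client, based only on the signatures in the client.py diff above. The constructor arguments, the headroom_* keyword arguments, chat.completions.simulate(), get_stats(), and close() all appear in that file; the OpenAI SDK client and the OpenAIProvider import path and class name are illustrative assumptions, not confirmed by this diff.

from openai import OpenAI  # assumed underlying SDK client
from headroom.client import HeadroomClient
from headroom.providers.openai import OpenAIProvider  # hypothetical class name; only the module path appears in this release

# Wrap an existing client; "audit" records metrics without changing requests.
client = HeadroomClient(
    original_client=OpenAI(),
    provider=OpenAIProvider(),  # hypothetical constructor
    default_mode="audit",
)

messages = [{"role": "user", "content": "Summarize our last conversation."}]

# Dry run: project token savings without calling the API.
sim = client.chat.completions.simulate(model="gpt-4o-mini", messages=messages)
print(sim.tokens_before, sim.tokens_after, sim.estimated_savings)

# Real call, opting into optimization for this request only.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
    headroom_mode="optimize",
)

print(client.get_stats()["session"])  # in-memory session counters
client.close()  # close the metrics store

The same pattern applies to the Anthropic-style surface (client.messages.create / stream / simulate), which routes through the same _create path with api_style="anthropic".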