headroom-ai 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/proxy/server.py
@@ -0,0 +1,2683 @@
+ """Headroom Proxy Server - Production Ready.
+
+ A full-featured LLM proxy with optimization, caching, rate limiting,
+ and observability.
+
+ Features:
+ - Context optimization (SmartCrusher, CacheAligner, RollingWindow)
+ - Semantic caching (save costs on repeated queries)
+ - Rate limiting (token bucket)
+ - Retry with exponential backoff
+ - Cost tracking and budgets
+ - Request tagging and metadata
+ - Provider fallback
+ - Prometheus metrics
+ - Full request/response logging
+
+ Usage:
+     python -m headroom.proxy.server --port 8787
+
+     # With Claude Code:
+     ANTHROPIC_BASE_URL=http://localhost:8787 claude
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import asyncio
+ import hashlib
+ import json
+ import logging
+ import os
+ import random
+ import sys
+ import time
+ from collections import OrderedDict, defaultdict, deque
+ from dataclasses import asdict, dataclass
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Literal
+
+ import httpx
+
+ try:
+     import uvicorn
+     from fastapi import FastAPI, HTTPException, Request, Response
+     from fastapi.middleware.cors import CORSMiddleware
+     from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
+
+     FASTAPI_AVAILABLE = True
+ except ImportError:
+     FASTAPI_AVAILABLE = False
+
+ # Add parent to path for imports
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+ from headroom.cache.compression_feedback import get_compression_feedback
+ from headroom.cache.compression_store import get_compression_store
+ from headroom.ccr import (
+     CCR_TOOL_NAME,
+     CCRResponseHandler,
+     CCRToolInjector,
+     ContextTracker,
+     ContextTrackerConfig,
+     ResponseHandlerConfig,
+     parse_tool_call,
+ )
+ from headroom.config import CacheAlignerConfig, CCRConfig, RollingWindowConfig, SmartCrusherConfig
+ from headroom.providers import AnthropicProvider, OpenAIProvider
+ from headroom.telemetry import get_telemetry_collector
+ from headroom.tokenizers import get_tokenizer
+ from headroom.transforms import (
+     _LLMLINGUA_AVAILABLE,
+     CacheAligner,
+     CodeAwareCompressor,
+     CodeCompressorConfig,
+     ContentRouter,
+     ContentRouterConfig,
+     RollingWindow,
+     SmartCrusher,
+     TransformPipeline,
+     is_tree_sitter_available,
+ )
+
+ # Conditionally import LLMLingua if available
+ if _LLMLINGUA_AVAILABLE:
+     from headroom.transforms import LLMLinguaCompressor, LLMLinguaConfig
+
+ # Try to import LiteLLM for pricing
+ try:
+     import litellm
+
+     LITELLM_AVAILABLE = True
+ except ImportError:
+     LITELLM_AVAILABLE = False
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger("headroom.proxy")
+
+ # Maximum request body size (10MB)
+ MAX_REQUEST_BODY_SIZE = 10 * 1024 * 1024
+
+
+ # =============================================================================
+ # Data Models
+ # =============================================================================
+
+
+ @dataclass
+ class RequestLog:
+     """Complete log of a single request."""
+
+     request_id: str
+     timestamp: str
+     provider: str
+     model: str
+
+     # Tokens
+     input_tokens_original: int
+     input_tokens_optimized: int
+     output_tokens: int | None
+     tokens_saved: int
+     savings_percent: float
+
+     # Cost
+     estimated_cost_usd: float | None
+     estimated_savings_usd: float | None
+
+     # Performance
+     optimization_latency_ms: float
+     total_latency_ms: float | None
+
+     # Metadata
+     tags: dict[str, str]
+     cache_hit: bool
+     transforms_applied: list[str]
+
+     # Request/Response (optional, for debugging)
+     request_messages: list[dict] | None = None
+     response_content: str | None = None
+     error: str | None = None
+
+
+ @dataclass
+ class CacheEntry:
+     """Cached response entry."""
+
+     response_body: bytes
+     response_headers: dict[str, str]
+     created_at: datetime
+     ttl_seconds: int
+     hit_count: int = 0
+     tokens_saved_per_hit: int = 0
+
+
+ @dataclass
+ class RateLimitState:
+     """Token bucket rate limiter state."""
+
+     tokens: float
+     last_update: float
+
+
+ @dataclass
+ class ProxyConfig:
+     """Proxy configuration."""
+
+     # Server
+     host: str = "127.0.0.1"
+     port: int = 8787
+
+     # Optimization
+     optimize: bool = True
+     min_tokens_to_crush: int = 500
+     max_items_after_crush: int = 50
+     keep_last_turns: int = 4
+
+     # CCR Tool Injection
+     ccr_inject_tool: bool = True  # Inject headroom_retrieve tool when compression occurs
+     ccr_inject_system_instructions: bool = False  # Add instructions to system message
+
+     # CCR Response Handling (intercept and handle CCR tool calls automatically)
+     ccr_handle_responses: bool = True  # Handle headroom_retrieve calls in responses
+     ccr_max_retrieval_rounds: int = 3  # Max rounds of retrieval before returning
+
+     # CCR Context Tracking (track compressed content across turns)
+     ccr_context_tracking: bool = True  # Track compressed contexts for proactive expansion
+     ccr_proactive_expansion: bool = True  # Proactively expand based on query relevance
+     ccr_max_proactive_expansions: int = 2  # Max contexts to proactively expand per turn
+
+     # LLMLingua ML-based compression (ON by default if installed)
+     llmlingua_enabled: bool = True  # Enable LLMLingua-2 for ML-based compression
+     llmlingua_device: str = "auto"  # Device: 'auto', 'cuda', 'cpu', 'mps'
+     llmlingua_target_rate: float = 0.3  # Target compression rate (0.3 = keep 30%)
+
+     # Code-aware compression (ON by default if installed)
+     code_aware_enabled: bool = True  # Enable AST-based code compression
+
+     # Smart content routing (routes each message to optimal compressor)
+     smart_routing: bool = True  # Use ContentRouter for intelligent compression
+
+     # Caching
+     cache_enabled: bool = True
+     cache_ttl_seconds: int = 3600  # 1 hour
+     cache_max_entries: int = 1000
+
+     # Rate limiting
+     rate_limit_enabled: bool = True
+     rate_limit_requests_per_minute: int = 60
+     rate_limit_tokens_per_minute: int = 100000
+
+     # Retry
+     retry_enabled: bool = True
+     retry_max_attempts: int = 3
+     retry_base_delay_ms: int = 1000
+     retry_max_delay_ms: int = 30000
+
+     # Cost tracking
+     cost_tracking_enabled: bool = True
+     budget_limit_usd: float | None = None  # None = unlimited
+     budget_period: Literal["hourly", "daily", "monthly"] = "daily"
+
+     # Logging
+     log_requests: bool = True
+     log_file: str | None = None
+     log_full_messages: bool = False  # Privacy: don't log content by default
+
+     # Fallback
+     fallback_enabled: bool = False
+     fallback_provider: str | None = None  # "openai" or "anthropic"
+
+     # Timeouts
+     request_timeout_seconds: int = 300
+     connect_timeout_seconds: int = 10
+
+
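As a usage sketch (not part of the diff), the configuration above can be constructed with keyword overrides; the field names come straight from the dataclass, and the import path follows the package layout shown in the file list.

    # Sketch: a ProxyConfig with a daily budget, shorter cache TTL, and a log file.
    from headroom.proxy.server import HeadroomProxy, ProxyConfig

    config = ProxyConfig(
        port=8787,
        optimize=True,
        cache_ttl_seconds=1800,               # 30 minutes instead of the 1 hour default
        rate_limit_requests_per_minute=30,
        budget_limit_usd=25.0,                # enforced per budget_period
        budget_period="daily",
        log_file="/tmp/headroom-proxy.jsonl",
    )
    proxy = HeadroomProxy(config)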
+ # =============================================================================
+ # Caching
+ # =============================================================================
+
+
+ class SemanticCache:
+     """Simple semantic cache based on message content hash.
+
+     Uses OrderedDict for O(1) LRU eviction instead of list with O(n) pop(0).
+     """
+
+     def __init__(self, max_entries: int = 1000, ttl_seconds: int = 3600):
+         self.max_entries = max_entries
+         self.ttl_seconds = ttl_seconds
+         # OrderedDict maintains insertion order and supports O(1) move_to_end/popitem
+         self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
+         self._lock = asyncio.Lock()
+
+     def _compute_key(self, messages: list[dict], model: str) -> str:
+         """Compute cache key from messages and model."""
+         # Normalize messages for consistent hashing
+         normalized = json.dumps(
+             {
+                 "model": model,
+                 "messages": messages,
+             },
+             sort_keys=True,
+         )
+         return hashlib.sha256(normalized.encode()).hexdigest()[:32]
+
+     async def get(self, messages: list[dict], model: str) -> CacheEntry | None:
+         """Get cached response if exists and not expired."""
+         key = self._compute_key(messages, model)
+         async with self._lock:
+             entry = self._cache.get(key)
+
+             if entry is None:
+                 return None
+
+             # Check expiration
+             age = (datetime.now() - entry.created_at).total_seconds()
+             if age > entry.ttl_seconds:
+                 del self._cache[key]
+                 return None
+
+             entry.hit_count += 1
+             # Move to end for LRU (O(1) operation)
+             self._cache.move_to_end(key)
+             return entry
+
+     async def set(
+         self,
+         messages: list[dict],
+         model: str,
+         response_body: bytes,
+         response_headers: dict[str, str],
+         tokens_saved: int = 0,
+     ):
+         """Cache a response."""
+         key = self._compute_key(messages, model)
+
+         async with self._lock:
+             # If key already exists, remove it first to update position
+             if key in self._cache:
+                 del self._cache[key]
+
+             # Evict oldest entries if at capacity (LRU) - O(1) with popitem
+             while len(self._cache) >= self.max_entries:
+                 self._cache.popitem(last=False)  # Remove oldest (first) entry
+
+             self._cache[key] = CacheEntry(
+                 response_body=response_body,
+                 response_headers=response_headers,
+                 created_at=datetime.now(),
+                 ttl_seconds=self.ttl_seconds,
+                 tokens_saved_per_hit=tokens_saved,
+             )
+
+     async def stats(self) -> dict:
+         """Get cache statistics."""
+         async with self._lock:
+             total_hits = sum(e.hit_count for e in self._cache.values())
+             return {
+                 "entries": len(self._cache),
+                 "max_entries": self.max_entries,
+                 "total_hits": total_hits,
+                 "ttl_seconds": self.ttl_seconds,
+             }
+
+     async def clear(self):
+         """Clear all cache entries."""
+         async with self._lock:
+             self._cache.clear()
+
+
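The cache keys on a SHA-256 hash of the normalized (model, messages) pair, so an identical conversation replayed within the TTL is served locally. A small asyncio sketch of the round trip (illustrative values, not part of the packaged file):

    # Sketch: SemanticCache round trip - miss, set, then hit.
    import asyncio
    from headroom.proxy.server import SemanticCache

    async def demo() -> None:
        cache = SemanticCache(max_entries=100, ttl_seconds=60)
        messages = [{"role": "user", "content": "ping"}]

        assert await cache.get(messages, "claude-3-5-haiku") is None      # miss
        await cache.set(messages, "claude-3-5-haiku", b'{"ok": true}', {}, tokens_saved=120)
        entry = await cache.get(messages, "claude-3-5-haiku")             # hit
        print(entry.hit_count, await cache.stats())

    asyncio.run(demo())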
+ # =============================================================================
+ # Rate Limiting
+ # =============================================================================
+
+
+ class TokenBucketRateLimiter:
+     """Token bucket rate limiter for requests and tokens."""
+
+     def __init__(
+         self,
+         requests_per_minute: int = 60,
+         tokens_per_minute: int = 100000,
+     ):
+         self.requests_per_minute = requests_per_minute
+         self.tokens_per_minute = tokens_per_minute
+
+         # Per-key buckets (key = API key or IP)
+         self._request_buckets: dict[str, RateLimitState] = defaultdict(
+             lambda: RateLimitState(tokens=requests_per_minute, last_update=time.time())
+         )
+         self._token_buckets: dict[str, RateLimitState] = defaultdict(
+             lambda: RateLimitState(tokens=tokens_per_minute, last_update=time.time())
+         )
+         self._lock = asyncio.Lock()
+
+     def _refill(self, state: RateLimitState, rate_per_minute: float) -> float:
+         """Refill bucket based on elapsed time."""
+         now = time.time()
+         elapsed = now - state.last_update
+         refill = elapsed * (rate_per_minute / 60.0)
+         state.tokens = min(rate_per_minute, state.tokens + refill)
+         state.last_update = now
+         return state.tokens
+
+     async def check_request(self, key: str = "default") -> tuple[bool, float]:
+         """Check if request is allowed. Returns (allowed, wait_seconds)."""
+         async with self._lock:
+             state = self._request_buckets[key]
+             available = self._refill(state, self.requests_per_minute)
+
+             if available >= 1:
+                 state.tokens -= 1
+                 return True, 0
+
+             wait_seconds = (1 - available) * (60.0 / self.requests_per_minute)
+             return False, wait_seconds
+
+     async def check_tokens(self, key: str, token_count: int) -> tuple[bool, float]:
+         """Check if token usage is allowed."""
+         async with self._lock:
+             state = self._token_buckets[key]
+             available = self._refill(state, self.tokens_per_minute)
+
+             if available >= token_count:
+                 state.tokens -= token_count
+                 return True, 0
+
+             wait_seconds = (token_count - available) * (60.0 / self.tokens_per_minute)
+             return False, wait_seconds
+
+     async def stats(self) -> dict:
+         """Get rate limiter statistics."""
+         async with self._lock:
+             return {
+                 "requests_per_minute": self.requests_per_minute,
+                 "tokens_per_minute": self.tokens_per_minute,
+                 "active_keys": len(self._request_buckets),
+             }
+
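Each bucket refills continuously at rate_per_minute / 60 tokens per second and is capped at rate_per_minute, so a caller that drains it is told how long to wait before retrying. A sketch with deliberately small limits (illustrative, not part of the packaged file):

    # Sketch: two requests pass, the third is told to wait ~30s at 2 requests/minute.
    import asyncio
    from headroom.proxy.server import TokenBucketRateLimiter

    async def demo() -> None:
        limiter = TokenBucketRateLimiter(requests_per_minute=2, tokens_per_minute=1000)
        print(await limiter.check_request("client-a"))      # (True, 0)
        print(await limiter.check_request("client-a"))      # (True, 0)
        print(await limiter.check_request("client-a"))      # (False, ~30.0) seconds to wait
        print(await limiter.check_tokens("client-a", 500))  # token budget is tracked separately

    asyncio.run(demo())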
402
+
403
+ # =============================================================================
404
+ # Cost Tracking
405
+ # =============================================================================
406
+
407
+
408
+ class CostTracker:
409
+ """Track costs and enforce budgets.
410
+
411
+ Cost history is automatically pruned to prevent unbounded memory growth:
412
+ - Entries older than 24 hours are removed
413
+ - Maximum of 100,000 entries are kept
414
+ """
415
+
416
+ # Fallback pricing - LiteLLM is preferred source
417
+ # Pricing per 1M tokens (input, output, cached_input)
418
+ PRICING = {
419
+ # Anthropic
420
+ "claude-3-5-sonnet": (3.00, 15.00, 0.30),
421
+ "claude-3-5-haiku": (0.80, 4.00, 0.08),
422
+ "claude-3-opus": (15.00, 75.00, 1.50),
423
+ "claude-sonnet-4": (3.00, 15.00, 0.30),
424
+ "claude-opus-4": (15.00, 75.00, 1.50),
425
+ # OpenAI
426
+ "gpt-4o": (2.50, 10.00, 1.25),
427
+ "gpt-4o-mini": (0.15, 0.60, 0.075),
428
+ "o1": (15.00, 60.00, 7.50),
429
+ "o1-mini": (1.10, 4.40, 0.55),
430
+ "o3-mini": (1.10, 4.40, 0.55),
431
+ "gpt-4-turbo": (10.00, 30.00, 5.00),
432
+ }
433
+
434
+ MAX_COST_ENTRIES = 100_000
435
+ COST_RETENTION_HOURS = 24
436
+
437
+ def __init__(self, budget_limit_usd: float | None = None, budget_period: str = "daily"):
438
+ self.budget_limit_usd = budget_limit_usd
439
+ self.budget_period = budget_period
440
+
441
+ # Cost tracking - using deque for efficient left-side removal
442
+ self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
443
+ self._total_cost_usd: float = 0
444
+ self._total_savings_usd: float = 0
445
+ self._last_prune_time: datetime = datetime.now()
446
+
447
+ def _get_pricing(self, model: str) -> tuple[float, float, float] | None:
448
+ """Get pricing for model."""
449
+ model_lower = model.lower()
450
+ for prefix, pricing in self.PRICING.items():
451
+ if prefix in model_lower:
452
+ return pricing
453
+ return None
454
+
455
+ def estimate_cost(
456
+ self,
457
+ model: str,
458
+ input_tokens: int,
459
+ output_tokens: int,
460
+ cached_tokens: int = 0,
461
+ ) -> float | None:
462
+ """Estimate cost in USD."""
463
+ # Try LiteLLM first
464
+ if LITELLM_AVAILABLE:
465
+ try:
466
+ cost = litellm.completion_cost(
467
+ model=model,
468
+ prompt_tokens=input_tokens,
469
+ completion_tokens=output_tokens,
470
+ )
471
+ if cost is not None and cost > 0:
472
+ return float(cost)
473
+ except Exception:
474
+ pass
475
+
476
+ # Fall back to hardcoded pricing
477
+ pricing = self._get_pricing(model)
478
+ if pricing is None:
479
+ return None
480
+
481
+ input_price, output_price, cached_price = pricing
482
+
483
+ regular_input = input_tokens - cached_tokens
484
+ cost = (
485
+ (regular_input / 1_000_000) * input_price
486
+ + (cached_tokens / 1_000_000) * cached_price
487
+ + (output_tokens / 1_000_000) * output_price
488
+ )
489
+ return cost
490
+
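When LiteLLM is unavailable, the fallback table above is used; for example, with the "gpt-4o-mini" row ($0.15 input, $0.60 output, $0.075 cached input per 1M tokens) the arithmetic works out as follows:

    # Worked example of the fallback pricing path for "gpt-4o-mini".
    input_tokens, cached_tokens, output_tokens = 100_000, 20_000, 5_000
    input_price, output_price, cached_price = 0.15, 0.60, 0.075

    cost = (
        ((input_tokens - cached_tokens) / 1_000_000) * input_price   # 0.0120
        + (cached_tokens / 1_000_000) * cached_price                 # 0.0015
        + (output_tokens / 1_000_000) * output_price                 # 0.0030
    )
    print(f"${cost:.4f}")  # $0.0165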
491
+ def _prune_old_costs(self):
492
+ """Remove cost entries older than retention period.
493
+
494
+ Called periodically (every 5 minutes) to prevent unbounded memory growth.
495
+ The deque maxlen provides a hard cap, but time-based pruning keeps
496
+ memory usage proportional to actual traffic patterns.
497
+ """
498
+ now = datetime.now()
499
+ # Only prune every 5 minutes to avoid overhead
500
+ if (now - self._last_prune_time).total_seconds() < 300:
501
+ return
502
+
503
+ self._last_prune_time = now
504
+ cutoff = now - timedelta(hours=self.COST_RETENTION_HOURS)
505
+
506
+ # Remove entries from the left (oldest) while they're older than cutoff
507
+ while self._costs and self._costs[0][0] < cutoff:
508
+ self._costs.popleft()
509
+
510
+ def record_cost(self, cost_usd: float):
511
+ """Record a cost. Periodically prunes old entries."""
512
+ self._costs.append((datetime.now(), cost_usd))
513
+ self._total_cost_usd += cost_usd
514
+ # Periodically prune old costs to prevent memory growth
515
+ self._prune_old_costs()
516
+
517
+ def record_savings(self, savings_usd: float):
518
+ """Record savings from optimization."""
519
+ self._total_savings_usd += savings_usd
520
+
521
+ def get_period_cost(self) -> float:
522
+ """Get cost for current budget period."""
523
+ now = datetime.now()
524
+
525
+ if self.budget_period == "hourly":
526
+ cutoff = now - timedelta(hours=1)
527
+ elif self.budget_period == "daily":
528
+ cutoff = now.replace(hour=0, minute=0, second=0, microsecond=0)
529
+ else: # monthly
530
+ cutoff = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
531
+
532
+ return sum(cost for ts, cost in self._costs if ts >= cutoff)
533
+
534
+ def check_budget(self) -> tuple[bool, float]:
535
+ """Check if within budget. Returns (allowed, remaining)."""
536
+ if self.budget_limit_usd is None:
537
+ return True, float("inf")
538
+
539
+ period_cost = self.get_period_cost()
540
+ remaining = self.budget_limit_usd - period_cost
541
+ return remaining > 0, max(0, remaining)
542
+
543
+ def stats(self) -> dict:
544
+ """Get cost statistics."""
545
+ return {
546
+ "total_cost_usd": round(self._total_cost_usd, 4),
547
+ "total_savings_usd": round(self._total_savings_usd, 4),
548
+ "period_cost_usd": round(self.get_period_cost(), 4),
549
+ "budget_limit_usd": self.budget_limit_usd,
550
+ "budget_period": self.budget_period,
551
+ "budget_remaining_usd": round(self.check_budget()[1], 4)
552
+ if self.budget_limit_usd
553
+ else None,
554
+ }
555
+
556
+
557
+ # =============================================================================
558
+ # Prometheus Metrics
559
+ # =============================================================================
560
+
561
+
562
+ class PrometheusMetrics:
563
+ """Prometheus-compatible metrics."""
564
+
565
+ def __init__(self):
566
+ self.requests_total = 0
567
+ self.requests_by_provider: dict[str, int] = defaultdict(int)
568
+ self.requests_by_model: dict[str, int] = defaultdict(int)
569
+ self.requests_cached = 0
570
+ self.requests_rate_limited = 0
571
+ self.requests_failed = 0
572
+
573
+ self.tokens_input_total = 0
574
+ self.tokens_output_total = 0
575
+ self.tokens_saved_total = 0
576
+
577
+ self.latency_sum_ms = 0.0
578
+ self.latency_count = 0
579
+
580
+ self.cost_total_usd = 0.0
581
+ self.savings_total_usd = 0.0
582
+
583
+ self._lock = asyncio.Lock()
584
+
585
+ async def record_request(
586
+ self,
587
+ provider: str,
588
+ model: str,
589
+ input_tokens: int,
590
+ output_tokens: int,
591
+ tokens_saved: int,
592
+ latency_ms: float,
593
+ cached: bool = False,
594
+ cost_usd: float = 0,
595
+ savings_usd: float = 0,
596
+ ):
597
+ """Record metrics for a request."""
598
+ async with self._lock:
599
+ self.requests_total += 1
600
+ self.requests_by_provider[provider] += 1
601
+ self.requests_by_model[model] += 1
602
+
603
+ if cached:
604
+ self.requests_cached += 1
605
+
606
+ self.tokens_input_total += input_tokens
607
+ self.tokens_output_total += output_tokens
608
+ self.tokens_saved_total += tokens_saved
609
+
610
+ self.latency_sum_ms += latency_ms
611
+ self.latency_count += 1
612
+
613
+ self.cost_total_usd += cost_usd
614
+ self.savings_total_usd += savings_usd
615
+
616
+ async def record_rate_limited(self):
617
+ async with self._lock:
618
+ self.requests_rate_limited += 1
619
+
620
+ async def record_failed(self):
621
+ async with self._lock:
622
+ self.requests_failed += 1
623
+
624
+ async def export(self) -> str:
625
+ """Export metrics in Prometheus format."""
626
+ async with self._lock:
627
+ lines = [
628
+ "# HELP headroom_requests_total Total number of requests",
629
+ "# TYPE headroom_requests_total counter",
630
+ f"headroom_requests_total {self.requests_total}",
631
+ "",
632
+ "# HELP headroom_requests_cached_total Cached request count",
633
+ "# TYPE headroom_requests_cached_total counter",
634
+ f"headroom_requests_cached_total {self.requests_cached}",
635
+ "",
636
+ "# HELP headroom_requests_rate_limited_total Rate limited requests",
637
+ "# TYPE headroom_requests_rate_limited_total counter",
638
+ f"headroom_requests_rate_limited_total {self.requests_rate_limited}",
639
+ "",
640
+ "# HELP headroom_requests_failed_total Failed requests",
641
+ "# TYPE headroom_requests_failed_total counter",
642
+ f"headroom_requests_failed_total {self.requests_failed}",
643
+ "",
644
+ "# HELP headroom_tokens_input_total Total input tokens",
645
+ "# TYPE headroom_tokens_input_total counter",
646
+ f"headroom_tokens_input_total {self.tokens_input_total}",
647
+ "",
648
+ "# HELP headroom_tokens_output_total Total output tokens",
649
+ "# TYPE headroom_tokens_output_total counter",
650
+ f"headroom_tokens_output_total {self.tokens_output_total}",
651
+ "",
652
+ "# HELP headroom_tokens_saved_total Tokens saved by optimization",
653
+ "# TYPE headroom_tokens_saved_total counter",
654
+ f"headroom_tokens_saved_total {self.tokens_saved_total}",
655
+ "",
656
+ "# HELP headroom_latency_ms_sum Sum of request latencies",
657
+ "# TYPE headroom_latency_ms_sum counter",
658
+ f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
659
+ "",
660
+ "# HELP headroom_cost_usd_total Total cost in USD",
661
+ "# TYPE headroom_cost_usd_total counter",
662
+ f"headroom_cost_usd_total {self.cost_total_usd:.6f}",
663
+ "",
664
+ "# HELP headroom_savings_usd_total Total savings in USD",
665
+ "# TYPE headroom_savings_usd_total counter",
666
+ f"headroom_savings_usd_total {self.savings_total_usd:.6f}",
667
+ ]
668
+
669
+ # Per-provider metrics
670
+ lines.extend(
671
+ [
672
+ "",
673
+ "# HELP headroom_requests_by_provider Requests by provider",
674
+ "# TYPE headroom_requests_by_provider counter",
675
+ ]
676
+ )
677
+ for provider, count in self.requests_by_provider.items():
678
+ lines.append(f'headroom_requests_by_provider{{provider="{provider}"}} {count}')
679
+
680
+ # Per-model metrics
681
+ lines.extend(
682
+ [
683
+ "",
684
+ "# HELP headroom_requests_by_model Requests by model",
685
+ "# TYPE headroom_requests_by_model counter",
686
+ ]
687
+ )
688
+ for model, count in self.requests_by_model.items():
689
+ lines.append(f'headroom_requests_by_model{{model="{model}"}} {count}')
690
+
691
+ return "\n".join(lines)
692
+
693
+
694
+ # =============================================================================
695
+ # Request Logger
696
+ # =============================================================================
697
+
698
+
699
+ class RequestLogger:
700
+ """Log requests to JSONL file.
701
+
702
+ Uses a deque with max 10,000 entries to prevent unbounded memory growth.
703
+ """
704
+
705
+ MAX_LOG_ENTRIES = 10_000
706
+
707
+ def __init__(self, log_file: str | None = None, log_full_messages: bool = False):
708
+ self.log_file = Path(log_file) if log_file else None
709
+ self.log_full_messages = log_full_messages
710
+ # Use deque with maxlen for automatic FIFO eviction
711
+ self._logs: deque[RequestLog] = deque(maxlen=self.MAX_LOG_ENTRIES)
712
+
713
+ if self.log_file:
714
+ self.log_file.parent.mkdir(parents=True, exist_ok=True)
715
+
716
+ def log(self, entry: RequestLog):
717
+ """Log a request. Oldest entries are automatically removed when limit reached."""
718
+ self._logs.append(entry)
719
+
720
+ if self.log_file:
721
+ with open(self.log_file, "a") as f:
722
+ log_dict = asdict(entry)
723
+ if not self.log_full_messages:
724
+ log_dict.pop("request_messages", None)
725
+ log_dict.pop("response_content", None)
726
+ f.write(json.dumps(log_dict) + "\n")
727
+
728
+ def get_recent(self, n: int = 100) -> list[dict]:
729
+ """Get recent log entries."""
730
+ # Convert deque to list for slicing (deque doesn't support slicing)
731
+ entries = list(self._logs)[-n:]
732
+ return [
733
+ {
734
+ k: v
735
+ for k, v in asdict(e).items()
736
+ if k not in ("request_messages", "response_content")
737
+ }
738
+ for e in entries
739
+ ]
740
+
741
+ def stats(self) -> dict:
742
+ """Get logging statistics."""
743
+ return {
744
+ "total_logged": len(self._logs),
745
+ "log_file": str(self.log_file) if self.log_file else None,
746
+ }
747
+
748
+
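When log_file is set, each request is appended as one JSON object per line (message bodies are omitted unless log_full_messages is enabled), so the log can be summarized with a few lines of Python; the path below is an assumed example, not something the package mandates.

    # Sketch: total the tokens_saved field across a JSONL request log.
    import json
    from pathlib import Path

    total_saved = 0
    for line in Path("/tmp/headroom-proxy.jsonl").read_text().splitlines():
        entry = json.loads(line)
        total_saved += entry.get("tokens_saved", 0)
    print(f"tokens saved across logged requests: {total_saved:,}")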
749
+ # =============================================================================
750
+ # Main Proxy
751
+ # =============================================================================
752
+
753
+
754
+ class HeadroomProxy:
755
+ """Production-ready Headroom optimization proxy."""
756
+
757
+ ANTHROPIC_API_URL = "https://api.anthropic.com"
758
+ OPENAI_API_URL = "https://api.openai.com"
759
+
760
+ def __init__(self, config: ProxyConfig):
761
+ self.config = config
762
+
763
+ # Initialize providers
764
+ self.anthropic_provider = AnthropicProvider()
765
+ self.openai_provider = OpenAIProvider()
766
+
767
+ # Initialize transforms based on routing mode
768
+ if config.smart_routing:
769
+ # Smart routing: ContentRouter handles all content types intelligently
770
+ # It lazy-loads compressors (including LLMLingua) only when needed
771
+ router_config = ContentRouterConfig(
772
+ enable_llmlingua=config.llmlingua_enabled,
773
+ enable_code_aware=config.code_aware_enabled,
774
+ )
775
+ transforms = [
776
+ CacheAligner(CacheAlignerConfig(enabled=True)),
777
+ ContentRouter(router_config),
778
+ RollingWindow(
779
+ RollingWindowConfig(
780
+ enabled=True,
781
+ keep_system=True,
782
+ keep_last_turns=config.keep_last_turns,
783
+ )
784
+ ),
785
+ ]
786
+ self._llmlingua_status = "lazy" if config.llmlingua_enabled else "disabled"
787
+ self._code_aware_status = "lazy" if config.code_aware_enabled else "disabled"
788
+ else:
789
+ # Legacy mode: sequential pipeline
790
+ transforms = [
791
+ CacheAligner(CacheAlignerConfig(enabled=True)),
792
+ SmartCrusher(
793
+ SmartCrusherConfig( # type: ignore[arg-type]
794
+ enabled=True,
795
+ min_tokens_to_crush=config.min_tokens_to_crush,
796
+ max_items_after_crush=config.max_items_after_crush,
797
+ ),
798
+ ccr_config=CCRConfig(
799
+ enabled=config.ccr_inject_tool,
800
+ inject_retrieval_marker=config.ccr_inject_tool, # Add CCR markers
801
+ ),
802
+ ),
803
+ RollingWindow(
804
+ RollingWindowConfig(
805
+ enabled=True,
806
+ keep_system=True,
807
+ keep_last_turns=config.keep_last_turns,
808
+ )
809
+ ),
810
+ ]
811
+ # Add LLMLingua if enabled and available
812
+ self._llmlingua_status = self._setup_llmlingua(config, transforms)
813
+ # Add CodeAware if enabled and available
814
+ self._code_aware_status = self._setup_code_aware(config, transforms)
815
+
816
+ self.anthropic_pipeline = TransformPipeline(
817
+ transforms=transforms,
818
+ provider=self.anthropic_provider,
819
+ )
820
+ self.openai_pipeline = TransformPipeline(
821
+ transforms=transforms,
822
+ provider=self.openai_provider,
823
+ )
824
+
825
+ # Initialize components
826
+ self.cache = (
827
+ SemanticCache(
828
+ max_entries=config.cache_max_entries,
829
+ ttl_seconds=config.cache_ttl_seconds,
830
+ )
831
+ if config.cache_enabled
832
+ else None
833
+ )
834
+
835
+ self.rate_limiter = (
836
+ TokenBucketRateLimiter(
837
+ requests_per_minute=config.rate_limit_requests_per_minute,
838
+ tokens_per_minute=config.rate_limit_tokens_per_minute,
839
+ )
840
+ if config.rate_limit_enabled
841
+ else None
842
+ )
843
+
844
+ self.cost_tracker = (
845
+ CostTracker(
846
+ budget_limit_usd=config.budget_limit_usd,
847
+ budget_period=config.budget_period,
848
+ )
849
+ if config.cost_tracking_enabled
850
+ else None
851
+ )
852
+
853
+ self.metrics = PrometheusMetrics()
854
+
855
+ self.logger = (
856
+ RequestLogger(
857
+ log_file=config.log_file,
858
+ log_full_messages=config.log_full_messages,
859
+ )
860
+ if config.log_requests
861
+ else None
862
+ )
863
+
864
+ # HTTP client
865
+ self.http_client: httpx.AsyncClient | None = None
866
+
867
+ # Request counter for IDs
868
+ self._request_counter = 0
869
+ self._request_counter_lock = asyncio.Lock()
870
+
871
+ # CCR tool injectors (one per provider)
872
+ self.anthropic_tool_injector = CCRToolInjector(
873
+ provider="anthropic",
874
+ inject_tool=config.ccr_inject_tool,
875
+ inject_system_instructions=config.ccr_inject_system_instructions,
876
+ )
877
+ self.openai_tool_injector = CCRToolInjector(
878
+ provider="openai",
879
+ inject_tool=config.ccr_inject_tool,
880
+ inject_system_instructions=config.ccr_inject_system_instructions,
881
+ )
882
+
883
+ # CCR Response Handler (handles CCR tool calls automatically)
884
+ self.ccr_response_handler = (
885
+ CCRResponseHandler(
886
+ ResponseHandlerConfig(
887
+ enabled=True,
888
+ max_retrieval_rounds=config.ccr_max_retrieval_rounds,
889
+ )
890
+ )
891
+ if config.ccr_handle_responses
892
+ else None
893
+ )
894
+
895
+ # CCR Context Tracker (tracks compressed content across turns)
896
+ self.ccr_context_tracker = (
897
+ ContextTracker(
898
+ ContextTrackerConfig(
899
+ enabled=True,
900
+ proactive_expansion=config.ccr_proactive_expansion,
901
+ max_proactive_expansions=config.ccr_max_proactive_expansions,
902
+ )
903
+ )
904
+ if config.ccr_context_tracking
905
+ else None
906
+ )
907
+
908
+ # Turn counter for context tracking
909
+ self._turn_counter = 0
910
+
911
+ def _setup_llmlingua(self, config: ProxyConfig, transforms: list) -> str:
912
+ """Set up LLMLingua compression if enabled.
913
+
914
+ Args:
915
+ config: Proxy configuration
916
+ transforms: Transform list to append to
917
+
918
+ Returns:
919
+ Status string for logging: 'enabled', 'disabled', 'available', 'unavailable'
920
+ """
921
+ if config.llmlingua_enabled:
922
+ if _LLMLINGUA_AVAILABLE:
923
+ llmlingua_config = LLMLinguaConfig(
924
+ device=config.llmlingua_device,
925
+ target_compression_rate=config.llmlingua_target_rate,
926
+ enable_ccr=config.ccr_inject_tool, # Link to CCR
927
+ )
928
+ # Insert before RollingWindow (which should be last)
929
+ # LLMLingua works best on individual tool outputs before windowing
930
+ transforms.insert(-1, LLMLinguaCompressor(llmlingua_config))
931
+ return "enabled"
932
+ else:
933
+ logger.warning(
934
+ "LLMLingua requested but not installed. "
935
+ "Install with: pip install headroom-ai[llmlingua]"
936
+ )
937
+ return "unavailable"
938
+ else:
939
+ if _LLMLINGUA_AVAILABLE:
940
+ return "available" # Available but not enabled - hint to user
941
+ return "disabled"
942
+
943
+ def _setup_code_aware(self, config: ProxyConfig, transforms: list) -> str:
944
+ """Set up code-aware compression if enabled.
945
+
946
+ Args:
947
+ config: Proxy configuration
948
+ transforms: Transform list to append to
949
+
950
+ Returns:
951
+ Status string for logging: 'enabled', 'disabled', 'available', 'unavailable'
952
+ """
953
+ if config.code_aware_enabled:
954
+ if is_tree_sitter_available():
955
+ code_config = CodeCompressorConfig(
956
+ preserve_imports=True,
957
+ preserve_signatures=True,
958
+ preserve_type_annotations=True,
959
+ preserve_error_handlers=True,
960
+ )
961
+ # Insert before RollingWindow (which should be last)
962
+ transforms.insert(-1, CodeAwareCompressor(code_config))
963
+ return "enabled"
964
+ else:
965
+ logger.warning(
966
+ "Code-aware compression requested but tree-sitter not installed. "
967
+ "Install with: pip install headroom-ai[code]"
968
+ )
969
+ return "unavailable"
970
+ else:
971
+ if is_tree_sitter_available():
972
+ return "available" # Available but not enabled
973
+ return "disabled"
974
+
975
+ async def startup(self):
976
+ """Initialize async resources."""
977
+ self.http_client = httpx.AsyncClient(
978
+ timeout=httpx.Timeout(
979
+ connect=self.config.connect_timeout_seconds,
980
+ read=self.config.request_timeout_seconds,
981
+ write=self.config.request_timeout_seconds,
982
+ pool=self.config.connect_timeout_seconds,
983
+ )
984
+ )
985
+ logger.info("Headroom Proxy started")
986
+ logger.info(f"Optimization: {'ENABLED' if self.config.optimize else 'DISABLED'}")
987
+ logger.info(f"Caching: {'ENABLED' if self.config.cache_enabled else 'DISABLED'}")
988
+ logger.info(f"Rate Limiting: {'ENABLED' if self.config.rate_limit_enabled else 'DISABLED'}")
989
+
990
+ # Smart routing status
991
+ if self.config.smart_routing:
992
+ logger.info("Smart Routing: ENABLED (intelligent content detection)")
993
+ else:
994
+ logger.info("Smart Routing: DISABLED (legacy sequential mode)")
995
+
996
+ # LLMLingua status with helpful hint
997
+ if self._llmlingua_status == "enabled":
998
+ logger.info(
999
+ f"LLMLingua: ENABLED (device={self.config.llmlingua_device}, "
1000
+ f"rate={self.config.llmlingua_target_rate})"
1001
+ )
1002
+ elif self._llmlingua_status == "lazy":
1003
+ logger.info("LLMLingua: LAZY (will load when prose content detected)")
1004
+ elif self._llmlingua_status == "available":
1005
+ logger.info("LLMLingua: available but disabled (use --llmlingua)")
1006
+ elif self._llmlingua_status == "unavailable":
1007
+ logger.info("LLMLingua: not installed (pip install headroom-ai[llmlingua])")
1008
+ elif self._llmlingua_status == "disabled":
1009
+ logger.info("LLMLingua: DISABLED")
1010
+
1011
+ # Code-aware status
1012
+ if self._code_aware_status == "enabled":
1013
+ logger.info("Code-Aware: ENABLED (AST-based compression)")
1014
+ elif self._code_aware_status == "lazy":
1015
+ logger.info("Code-Aware: LAZY (will load when code content detected)")
1016
+ elif self._code_aware_status == "available":
1017
+ logger.info("Code-Aware: available but disabled (use --code-aware)")
1018
+ elif self._code_aware_status == "unavailable":
1019
+ logger.info("Code-Aware: not installed (pip install headroom-ai[code])")
1020
+ elif self._code_aware_status == "disabled":
1021
+ logger.info("Code-Aware: DISABLED")
1022
+
1023
+ # CCR status
1024
+ ccr_features = []
1025
+ if self.config.ccr_inject_tool:
1026
+ ccr_features.append("tool_injection")
1027
+ if self.config.ccr_handle_responses:
1028
+ ccr_features.append("response_handling")
1029
+ if self.config.ccr_context_tracking:
1030
+ ccr_features.append("context_tracking")
1031
+ if self.config.ccr_proactive_expansion:
1032
+ ccr_features.append("proactive_expansion")
1033
+ if ccr_features:
1034
+ logger.info(f"CCR (Compress-Cache-Retrieve): ENABLED ({', '.join(ccr_features)})")
1035
+ else:
1036
+ logger.info("CCR: DISABLED")
1037
+
1038
+ async def shutdown(self):
1039
+ """Cleanup async resources."""
1040
+ if self.http_client:
1041
+ await self.http_client.aclose()
1042
+
1043
+ # Print final stats
1044
+ self._print_summary()
1045
+
1046
+ def _print_summary(self):
1047
+ """Print session summary."""
1048
+ m = self.metrics
1049
+ logger.info("=" * 70)
1050
+ logger.info("HEADROOM PROXY SESSION SUMMARY")
1051
+ logger.info("=" * 70)
1052
+ logger.info(f"Total requests: {m.requests_total}")
1053
+ logger.info(f"Cached responses: {m.requests_cached}")
1054
+ logger.info(f"Rate limited: {m.requests_rate_limited}")
1055
+ logger.info(f"Failed: {m.requests_failed}")
1056
+ logger.info(f"Input tokens: {m.tokens_input_total:,}")
1057
+ logger.info(f"Output tokens: {m.tokens_output_total:,}")
1058
+ logger.info(f"Tokens saved: {m.tokens_saved_total:,}")
1059
+ if m.tokens_input_total > 0:
1060
+ savings_pct = (
1061
+ m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
1062
+ ) * 100
1063
+ logger.info(f"Token savings: {savings_pct:.1f}%")
1064
+ logger.info(f"Total cost: ${m.cost_total_usd:.4f}")
1065
+ logger.info(f"Total savings: ${m.savings_total_usd:.4f}")
1066
+ if m.latency_count > 0:
1067
+ avg_latency = m.latency_sum_ms / m.latency_count
1068
+ logger.info(f"Avg latency: {avg_latency:.0f}ms")
1069
+ logger.info("=" * 70)
1070
+
1071
+ async def _next_request_id(self) -> str:
1072
+ """Generate unique request ID."""
1073
+ async with self._request_counter_lock:
1074
+ self._request_counter += 1
1075
+ return f"hr_{int(time.time())}_{self._request_counter:06d}"
1076
+
1077
+ def _extract_tags(self, headers: dict) -> dict[str, str]:
1078
+ """Extract Headroom tags from headers."""
1079
+ tags = {}
1080
+ for key, value in headers.items():
1081
+ if key.lower().startswith("x-headroom-"):
1082
+ tag_name = key.lower().replace("x-headroom-", "")
1083
+ tags[tag_name] = value
1084
+ return tags
1085
+
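Any request header beginning with x-headroom- is captured as a tag (the prefix is stripped and matching is case-insensitive) and carried into RequestLog.tags. The mapping can be illustrated standalone with the same logic as the method above:

    # Sketch of the header-to-tag mapping performed by _extract_tags.
    headers = {
        "x-api-key": "sk-...",               # ignored: not a headroom tag
        "x-headroom-team": "search",
        "X-Headroom-Feature": "reranker",    # case-insensitive match
    }
    tags = {
        k.lower().replace("x-headroom-", ""): v
        for k, v in headers.items()
        if k.lower().startswith("x-headroom-")
    }
    print(tags)  # {'team': 'search', 'feature': 'reranker'}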
1086
+ async def _retry_request(
1087
+ self,
1088
+ method: str,
1089
+ url: str,
1090
+ headers: dict,
1091
+ body: dict,
1092
+ stream: bool = False,
1093
+ ) -> httpx.Response:
1094
+ """Make request with retry and exponential backoff."""
1095
+ last_error = None
1096
+
1097
+ for attempt in range(self.config.retry_max_attempts):
1098
+ try:
1099
+ if stream:
1100
+ # For streaming, we return early - retry happens at higher level
1101
+ return await self.http_client.post(url, json=body, headers=headers) # type: ignore[union-attr]
1102
+ else:
1103
+ response = await self.http_client.post(url, json=body, headers=headers) # type: ignore[union-attr]
1104
+
1105
+ # Don't retry client errors (4xx)
1106
+ if 400 <= response.status_code < 500:
1107
+ return response
1108
+
1109
+ # Retry server errors (5xx)
1110
+ if response.status_code >= 500:
1111
+ raise httpx.HTTPStatusError(
1112
+ f"Server error: {response.status_code}",
1113
+ request=response.request,
1114
+ response=response,
1115
+ )
1116
+
1117
+ return response
1118
+
1119
+ except (httpx.ConnectError, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
1120
+ last_error = e
1121
+
1122
+ if not self.config.retry_enabled or attempt >= self.config.retry_max_attempts - 1:
1123
+ raise
1124
+
1125
+ # Exponential backoff with jitter
1126
+ delay = min(
1127
+ self.config.retry_base_delay_ms * (2**attempt),
1128
+ self.config.retry_max_delay_ms,
1129
+ )
1130
+ delay_with_jitter = delay * (0.5 + random.random())
1131
+
1132
+ logger.warning(
1133
+ f"Request failed (attempt {attempt + 1}), retrying in {delay_with_jitter:.0f}ms: {e}"
1134
+ )
1135
+ await asyncio.sleep(delay_with_jitter / 1000)
1136
+
1137
+ raise last_error # type: ignore[misc]
1138
+
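The retry delay doubles per attempt from retry_base_delay_ms, is capped at retry_max_delay_ms, and is then scaled by a jitter factor drawn from [0.5, 1.5); with the defaults the nominal schedule is 1s, 2s, 4s. A standalone sketch of that schedule:

    # Sketch of the backoff schedule used by _retry_request with the default settings.
    import random

    base_ms, max_ms = 1000, 30000
    for attempt in range(3):
        delay = min(base_ms * (2 ** attempt), max_ms)
        jittered = delay * (0.5 + random.random())
        print(f"attempt {attempt + 1}: nominal {delay} ms, jittered {jittered:.0f} ms")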
1139
+ async def handle_anthropic_messages(
1140
+ self,
1141
+ request: Request,
1142
+ ) -> Response | StreamingResponse:
1143
+ """Handle Anthropic /v1/messages endpoint."""
1144
+ start_time = time.time()
1145
+ request_id = await self._next_request_id()
1146
+
1147
+ # Check request body size
1148
+ content_length = request.headers.get("content-length")
1149
+ if content_length and int(content_length) > MAX_REQUEST_BODY_SIZE:
1150
+ return JSONResponse(
1151
+ status_code=413,
1152
+ content={
1153
+ "type": "error",
1154
+ "error": {
1155
+ "type": "request_too_large",
1156
+ "message": f"Request body too large. Maximum size is {MAX_REQUEST_BODY_SIZE // (1024 * 1024)}MB",
1157
+ },
1158
+ },
1159
+ )
1160
+
1161
+ # Parse request
1162
+ try:
1163
+ body = await request.json()
1164
+ except json.JSONDecodeError as e:
1165
+ return JSONResponse(
1166
+ status_code=400,
1167
+ content={
1168
+ "type": "error",
1169
+ "error": {
1170
+ "type": "invalid_request_error",
1171
+ "message": f"Invalid JSON in request body: {e!s}",
1172
+ },
1173
+ },
1174
+ )
1175
+ model = body.get("model", "unknown")
1176
+ messages = body.get("messages", [])
1177
+ stream = body.get("stream", False)
1178
+
1179
+ # Extract headers and tags
1180
+ headers = dict(request.headers.items())
1181
+ headers.pop("host", None)
1182
+ headers.pop("content-length", None)
1183
+ tags = self._extract_tags(headers)
1184
+
1185
+ # Rate limiting
1186
+ if self.rate_limiter:
1187
+ rate_key = headers.get("x-api-key", "default")[:16]
1188
+ allowed, wait_seconds = await self.rate_limiter.check_request(rate_key)
1189
+ if not allowed:
1190
+ await self.metrics.record_rate_limited()
1191
+ raise HTTPException(
1192
+ status_code=429,
1193
+ detail=f"Rate limited. Retry after {wait_seconds:.1f}s",
1194
+ headers={"Retry-After": str(int(wait_seconds) + 1)},
1195
+ )
1196
+
1197
+ # Budget check
1198
+ if self.cost_tracker:
1199
+ allowed, remaining = self.cost_tracker.check_budget()
1200
+ if not allowed:
1201
+ raise HTTPException(
1202
+ status_code=429,
1203
+ detail=f"Budget exceeded for {self.config.budget_period} period",
1204
+ )
1205
+
1206
+ # Check cache (non-streaming only)
1207
+ cache_hit = False
1208
+ if self.cache and not stream:
1209
+ cached = await self.cache.get(messages, model)
1210
+ if cached:
1211
+ cache_hit = True
1212
+ optimization_latency = (time.time() - start_time) * 1000
1213
+
1214
+ await self.metrics.record_request(
1215
+ provider="anthropic",
1216
+ model=model,
1217
+ input_tokens=0,
1218
+ output_tokens=0,
1219
+ tokens_saved=cached.tokens_saved_per_hit,
1220
+ latency_ms=optimization_latency,
1221
+ cached=True,
1222
+ )
1223
+
1224
+ # Remove compression headers from cached response
1225
+ response_headers = dict(cached.response_headers)
1226
+ response_headers.pop("content-encoding", None)
1227
+ response_headers.pop("content-length", None)
1228
+
1229
+ return Response(
1230
+ content=cached.response_body,
1231
+ headers=response_headers,
1232
+ media_type="application/json",
1233
+ )
1234
+
1235
+ # Count original tokens
1236
+ tokenizer = get_tokenizer(model)
1237
+ original_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
1238
+
1239
+ # Apply optimization
1240
+ transforms_applied = []
1241
+ optimized_messages = messages
1242
+ optimized_tokens = original_tokens
1243
+
1244
+ if self.config.optimize and messages:
1245
+ try:
1246
+ context_limit = self.anthropic_provider.get_context_limit(model)
1247
+ result = self.anthropic_pipeline.apply(
1248
+ messages=messages,
1249
+ model=model,
1250
+ model_limit=context_limit,
1251
+ )
1252
+
1253
+ if result.messages != messages:
1254
+ optimized_messages = result.messages
1255
+ transforms_applied = result.transforms_applied
1256
+ optimized_tokens = sum(
1257
+ tokenizer.count_text(str(m.get("content", ""))) for m in optimized_messages
1258
+ )
1259
+ except Exception as e:
1260
+ logger.warning(f"Optimization failed: {e}")
1261
+
1262
+ tokens_saved = original_tokens - optimized_tokens
1263
+ optimization_latency = (time.time() - start_time) * 1000
1264
+
1265
+ # CCR Tool Injection: Inject retrieval tool if compression occurred
1266
+ tools = body.get("tools")
1267
+ if self.config.ccr_inject_tool or self.config.ccr_inject_system_instructions:
1268
+ # Create fresh injector to avoid state leakage between requests
1269
+ injector = CCRToolInjector(
1270
+ provider="anthropic",
1271
+ inject_tool=self.config.ccr_inject_tool,
1272
+ inject_system_instructions=self.config.ccr_inject_system_instructions,
1273
+ )
1274
+ optimized_messages, tools, was_injected = injector.process_request(
1275
+ optimized_messages, tools
1276
+ )
1277
+
1278
+ if injector.has_compressed_content:
1279
+ if was_injected:
1280
+ logger.debug(
1281
+ f"[{request_id}] CCR: Injected retrieval tool for hashes: {injector.detected_hashes}"
1282
+ )
1283
+ else:
1284
+ logger.debug(
1285
+ f"[{request_id}] CCR: Tool already present (MCP?), skipped injection for hashes: {injector.detected_hashes}"
1286
+ )
1287
+
1288
+ # Track compression in context tracker for multi-turn awareness
1289
+ if self.ccr_context_tracker:
1290
+ self._turn_counter += 1
1291
+ for hash_key in injector.detected_hashes:
1292
+ # Get compression metadata from store
1293
+ store = get_compression_store()
1294
+ entry = store.get_metadata(hash_key)
1295
+ if entry:
1296
+ self.ccr_context_tracker.track_compression(
1297
+ hash_key=hash_key,
1298
+ turn_number=self._turn_counter,
1299
+ tool_name=entry.get("tool_name"),
1300
+ original_count=entry.get("original_item_count", 0),
1301
+ compressed_count=entry.get("compressed_item_count", 0),
1302
+ query_context=entry.get("query_context", ""),
1303
+ sample_content=entry.get("compressed_content", "")[:500],
1304
+ )
1305
+
1306
+ # CCR Proactive Expansion: Check if current query needs expanded context
1307
+ if self.ccr_context_tracker and self.config.ccr_proactive_expansion:
1308
+ # Extract user query from messages
1309
+ user_query = ""
1310
+ for msg in reversed(messages):
1311
+ if msg.get("role") == "user":
1312
+ content = msg.get("content", "")
1313
+ if isinstance(content, str):
1314
+ user_query = content
1315
+ elif isinstance(content, list):
1316
+ for block in content:
1317
+ if isinstance(block, dict) and block.get("type") == "text":
1318
+ user_query = block.get("text", "")
1319
+ break
1320
+ break
1321
+
1322
+ if user_query:
1323
+ recommendations = self.ccr_context_tracker.analyze_query(
1324
+ user_query, self._turn_counter
1325
+ )
1326
+ if recommendations:
1327
+ expansions = self.ccr_context_tracker.execute_expansions(recommendations)
1328
+ if expansions:
1329
+ # Add expanded context to the system message or as additional context
1330
+ expansion_text = self.ccr_context_tracker.format_expansions_for_context(
1331
+ expansions
1332
+ )
1333
+ logger.info(
1334
+ f"[{request_id}] CCR: Proactively expanded {len(expansions)} context(s) "
1335
+ f"based on query relevance"
1336
+ )
1337
+ # Append to the last user message
1338
+ if optimized_messages and optimized_messages[-1].get("role") == "user":
1339
+ last_msg = optimized_messages[-1]
1340
+ content = last_msg.get("content", "")
1341
+ if isinstance(content, str):
1342
+ optimized_messages[-1] = {
1343
+ **last_msg,
1344
+ "content": content + "\n\n" + expansion_text,
1345
+ }
1346
+
1347
+ # Update body
1348
+ body["messages"] = optimized_messages
1349
+ if tools is not None:
1350
+ body["tools"] = tools
1351
+
1352
+ # Forward request
1353
+ url = f"{self.ANTHROPIC_API_URL}/v1/messages"
1354
+
1355
+ try:
1356
+ if stream:
1357
+ return await self._stream_response(
1358
+ url,
1359
+ headers,
1360
+ body,
1361
+ "anthropic",
1362
+ model,
1363
+ request_id,
1364
+ original_tokens,
1365
+ optimized_tokens,
1366
+ tokens_saved,
1367
+ transforms_applied,
1368
+ tags,
1369
+ optimization_latency,
1370
+ )
1371
+ else:
1372
+ response = await self._retry_request("POST", url, headers, body)
1373
+
1374
+ # Parse response for CCR handling
1375
+ resp_json = None
1376
+ try:
1377
+ resp_json = response.json()
1378
+ except Exception:
1379
+ pass
1380
+
1381
+ # CCR Response Handling: Handle headroom_retrieve tool calls automatically
1382
+ if (
1383
+ self.ccr_response_handler
1384
+ and resp_json
1385
+ and response.status_code == 200
1386
+ and self.ccr_response_handler.has_ccr_tool_calls(resp_json, "anthropic")
1387
+ ):
1388
+ logger.info(f"[{request_id}] CCR: Detected retrieval tool call, handling...")
1389
+
1390
+ # Create API call function for continuation
1391
+ # Use a fresh client to avoid potential decompression state issues
1392
+ async def api_call_fn(
1393
+ msgs: list[dict], tls: list[dict] | None
1394
+ ) -> dict[str, Any]:
1395
+ continuation_body = {
1396
+ **body,
1397
+ "messages": msgs,
1398
+ }
1399
+ if tls is not None:
1400
+ continuation_body["tools"] = tls
1401
+
1402
+ # Use clean headers for continuation
1403
+ continuation_headers = {
1404
+ k: v
1405
+ for k, v in headers.items()
1406
+ if k.lower()
1407
+ not in (
1408
+ "content-encoding",
1409
+ "transfer-encoding",
1410
+ "accept-encoding",
1411
+ "content-length",
1412
+ )
1413
+ }
1414
+
1415
+ # Use a fresh client for CCR continuations
1416
+ logger.info(f"CCR: Making continuation request with {len(msgs)} messages")
1417
+ async with httpx.AsyncClient(
1418
+ timeout=httpx.Timeout(120.0),
1419
+ ) as ccr_client:
1420
+ try:
1421
+ cont_response = await ccr_client.post(
1422
+ url,
1423
+ json=continuation_body,
1424
+ headers=continuation_headers,
1425
+ )
1426
+ logger.info(
1427
+ f"CCR: Got response status={cont_response.status_code}, "
1428
+ f"content-encoding={cont_response.headers.get('content-encoding')}"
1429
+ )
1430
+ result: dict[str, Any] = cont_response.json()
1431
+ logger.info("CCR: Parsed JSON successfully")
1432
+ return result
1433
+ except Exception as e:
1434
+ logger.error(
1435
+ f"CCR: API call failed: {e}, "
1436
+ f"response headers: {dict(cont_response.headers) if 'cont_response' in dir() else 'N/A'}"
1437
+ )
1438
+ raise
1439
+
1440
+ # Handle CCR tool calls
1441
+ try:
1442
+ final_resp_json = await self.ccr_response_handler.handle_response(
1443
+ resp_json,
1444
+ optimized_messages,
1445
+ tools,
1446
+ api_call_fn,
1447
+ provider="anthropic",
1448
+ )
1449
+ # Update response content with final response
1450
+ resp_json = final_resp_json
1451
+ # Remove encoding headers since content is now uncompressed JSON
1452
+ ccr_response_headers = {
1453
+ k: v
1454
+ for k, v in response.headers.items()
1455
+ if k.lower() not in ("content-encoding", "content-length")
1456
+ }
1457
+ response = httpx.Response(
1458
+ status_code=200,
1459
+ content=json.dumps(final_resp_json).encode(),
1460
+ headers=ccr_response_headers,
1461
+ )
1462
+ logger.info(f"[{request_id}] CCR: Retrieval handled successfully")
1463
+ except Exception as e:
1464
+ import traceback
1465
+
1466
+ logger.warning(
1467
+ f"[{request_id}] CCR: Response handling failed: {e}\n"
1468
+ f"Traceback: {traceback.format_exc()}"
1469
+ )
1470
+ # Continue with original response
1471
+
1472
+ total_latency = (time.time() - start_time) * 1000
1473
+
1474
+ # Parse response for output tokens
1475
+ output_tokens = 0
1476
+ if resp_json:
1477
+ usage = resp_json.get("usage", {})
1478
+ output_tokens = usage.get("output_tokens", 0)
1479
+
1480
+ # Calculate cost
1481
+ cost_usd = None
1482
+ savings_usd = None
1483
+ if self.cost_tracker:
1484
+ cost_usd = self.cost_tracker.estimate_cost(
1485
+ model, optimized_tokens, output_tokens
1486
+ )
1487
+ original_cost = self.cost_tracker.estimate_cost(
1488
+ model, original_tokens, output_tokens
1489
+ )
1490
+ if cost_usd and original_cost:
1491
+ savings_usd = original_cost - cost_usd
1492
+ self.cost_tracker.record_cost(cost_usd)
1493
+ self.cost_tracker.record_savings(savings_usd)
1494
+
1495
+ # Cache response
1496
+ if self.cache and response.status_code == 200:
1497
+ await self.cache.set(
1498
+ messages,
1499
+ model,
1500
+ response.content,
1501
+ dict(response.headers),
1502
+ tokens_saved=tokens_saved,
1503
+ )
1504
+
1505
+ # Record metrics
1506
+ await self.metrics.record_request(
1507
+ provider="anthropic",
1508
+ model=model,
1509
+ input_tokens=optimized_tokens,
1510
+ output_tokens=output_tokens,
1511
+ tokens_saved=tokens_saved,
1512
+ latency_ms=total_latency,
1513
+ cost_usd=cost_usd or 0,
1514
+ savings_usd=savings_usd or 0,
1515
+ )
1516
+
1517
+ # Log request
1518
+ if self.logger:
1519
+ self.logger.log(
1520
+ RequestLog(
1521
+ request_id=request_id,
1522
+ timestamp=datetime.now().isoformat(),
1523
+ provider="anthropic",
1524
+ model=model,
1525
+ input_tokens_original=original_tokens,
1526
+ input_tokens_optimized=optimized_tokens,
1527
+ output_tokens=output_tokens,
1528
+ tokens_saved=tokens_saved,
1529
+ savings_percent=(tokens_saved / original_tokens * 100)
1530
+ if original_tokens > 0
1531
+ else 0,
1532
+ estimated_cost_usd=cost_usd,
1533
+ estimated_savings_usd=savings_usd,
1534
+ optimization_latency_ms=optimization_latency,
1535
+ total_latency_ms=total_latency,
1536
+ tags=tags,
1537
+ cache_hit=cache_hit,
1538
+ transforms_applied=transforms_applied,
1539
+ request_messages=messages if self.config.log_full_messages else None,
1540
+ )
1541
+ )
1542
+
1543
+ # Log to console
1544
+ if tokens_saved > 0:
1545
+ logger.info(
1546
+ f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
1547
+ f"(saved {tokens_saved:,} tokens, ${savings_usd:.4f})"
1548
+ if savings_usd
1549
+ else f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
1550
+ f"(saved {tokens_saved:,} tokens)"
1551
+ )
1552
+
1553
+ # Remove compression headers since httpx already decompressed the response
1554
+ response_headers = dict(response.headers)
1555
+ response_headers.pop("content-encoding", None)
1556
+ response_headers.pop("content-length", None) # Length changed after decompression
1557
+
1558
+ return Response(
1559
+ content=response.content,
1560
+ status_code=response.status_code,
1561
+ headers=response_headers,
1562
+ )
1563
+
1564
+ except Exception as e:
1565
+ await self.metrics.record_failed()
1566
+ # Log full error details internally for debugging
1567
+ logger.error(f"[{request_id}] Request failed: {type(e).__name__}: {e}")
1568
+
1569
+ # Try fallback if enabled
1570
+ if self.config.fallback_enabled and self.config.fallback_provider == "openai":
1571
+ logger.info(f"[{request_id}] Attempting fallback to OpenAI")
1572
+ # Convert to OpenAI format and retry
1573
+ # (simplified - would need message format conversion)
1574
+
1575
+ # Return sanitized error message to client (don't expose internal details)
1576
+ return JSONResponse(
1577
+ status_code=502,
1578
+ content={
1579
+ "type": "error",
1580
+ "error": {
1581
+ "type": "api_error",
1582
+ "message": "An error occurred while processing your request. Please try again.",
1583
+ },
1584
+ },
1585
+ )
1586
+
1587
+ async def _stream_response(
1588
+ self,
1589
+ url: str,
1590
+ headers: dict,
1591
+ body: dict,
1592
+ provider: str,
1593
+ model: str,
1594
+ request_id: str,
1595
+ original_tokens: int,
1596
+ optimized_tokens: int,
1597
+ tokens_saved: int,
1598
+ transforms_applied: list[str],
1599
+ tags: dict[str, str],
1600
+ optimization_latency: float,
1601
+ ) -> StreamingResponse:
1602
+ """Stream response with metrics tracking.
1603
+
1604
+ Calculates output size incrementally to avoid accumulating all chunks in memory.
1605
+ """
1606
+ start_time = time.time()
1607
+
1608
+ async def generate():
1609
+ # Track total bytes incrementally instead of accumulating chunks
1610
+ total_bytes = 0
1611
+ try:
1612
+ async with self.http_client.stream(
1613
+ "POST", url, json=body, headers=headers
1614
+ ) as response:
1615
+ async for chunk in response.aiter_bytes():
1616
+ total_bytes += len(chunk)
1617
+ yield chunk
1618
+ finally:
1619
+ # Record metrics after stream completes
1620
+ total_latency = (time.time() - start_time) * 1000
1621
+
1622
+ # Estimate output tokens from total bytes (rough estimate: ~4 bytes per token)
1623
+ output_tokens = total_bytes // 4
1624
+
1625
+ await self.metrics.record_request(
1626
+ provider=provider,
1627
+ model=model,
1628
+ input_tokens=optimized_tokens,
1629
+ output_tokens=output_tokens,
1630
+ tokens_saved=tokens_saved,
1631
+ latency_ms=total_latency,
1632
+ )
1633
+
1634
+ if tokens_saved > 0:
1635
+ logger.info(
1636
+ f"[{request_id}] {model}: saved {tokens_saved:,} tokens (streaming)"
1637
+ )
1638
+
1639
+ return StreamingResponse(
1640
+ generate(),
1641
+ media_type="text/event-stream",
1642
+ )
1643
+
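# Illustrative sketch (not part of the original file): the streaming path above counts
# bytes as chunks pass through rather than buffering the body, then derives a rough
# output-token estimate (~4 bytes per token). A stand-alone version of the same idea,
# assuming only httpx; the forwarding step is elided.
import httpx

async def stream_and_estimate(url: str, body: dict, headers: dict) -> int:
    total_bytes = 0
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("POST", url, json=body, headers=headers) as response:
            async for chunk in response.aiter_bytes():
                total_bytes += len(chunk)  # track size only; never accumulate chunk contents
                # ... forward `chunk` to the downstream consumer here ...
    return total_bytes // 4  # same crude bytes-per-token heuristic as above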
1644
+ async def handle_openai_chat(
1645
+ self,
1646
+ request: Request,
1647
+ ) -> Response | StreamingResponse:
1648
+ """Handle OpenAI /v1/chat/completions endpoint."""
1649
+ start_time = time.time()
1650
+ request_id = await self._next_request_id()
1651
+
1652
+ # Check request body size
1653
+ content_length = request.headers.get("content-length")
1654
+ if content_length and int(content_length) > MAX_REQUEST_BODY_SIZE:
1655
+ return JSONResponse(
1656
+ status_code=413,
1657
+ content={
1658
+ "error": {
1659
+ "message": f"Request body too large. Maximum size is {MAX_REQUEST_BODY_SIZE // (1024 * 1024)}MB",
1660
+ "type": "invalid_request_error",
1661
+ "code": "request_too_large",
1662
+ }
1663
+ },
1664
+ )
1665
+
1666
+ # Parse request
1667
+ try:
1668
+ body = await request.json()
1669
+ except json.JSONDecodeError as e:
1670
+ return JSONResponse(
1671
+ status_code=400,
1672
+ content={
1673
+ "error": {
1674
+ "message": f"Invalid JSON in request body: {e!s}",
1675
+ "type": "invalid_request_error",
1676
+ "code": "invalid_json",
1677
+ }
1678
+ },
1679
+ )
1680
+ model = body.get("model", "unknown")
1681
+ messages = body.get("messages", [])
1682
+ stream = body.get("stream", False)
1683
+
1684
+ headers = dict(request.headers.items())
1685
+ headers.pop("host", None)
1686
+ headers.pop("content-length", None)
1687
+ tags = self._extract_tags(headers)
1688
+
1689
+ # Rate limiting
1690
+ if self.rate_limiter:
1691
+ rate_key = headers.get("authorization", "default")[:20]
1692
+ allowed, wait_seconds = await self.rate_limiter.check_request(rate_key)
1693
+ if not allowed:
1694
+ await self.metrics.record_rate_limited()
1695
+ raise HTTPException(
1696
+ status_code=429,
1697
+ detail=f"Rate limited. Retry after {wait_seconds:.1f}s",
1698
+ )
1699
+
1700
+ # Check cache
1701
+ if self.cache and not stream:
1702
+ cached = await self.cache.get(messages, model)
1703
+ if cached:
1704
+ await self.metrics.record_request(
1705
+ provider="openai",
1706
+ model=model,
1707
+ input_tokens=0,
1708
+ output_tokens=0,
1709
+ tokens_saved=cached.tokens_saved_per_hit,
1710
+ latency_ms=(time.time() - start_time) * 1000,
1711
+ cached=True,
1712
+ )
1713
+
1714
+ # Remove compression headers from cached response
1715
+ response_headers = dict(cached.response_headers)
1716
+ response_headers.pop("content-encoding", None)
1717
+ response_headers.pop("content-length", None)
1718
+
1719
+ return Response(content=cached.response_body, headers=response_headers)
1720
+
1721
+ # Token counting
1722
+ tokenizer = get_tokenizer(model)
1723
+ original_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
1724
+
1725
+ # Optimization
1726
+ transforms_applied = []
1727
+ optimized_messages = messages
1728
+ optimized_tokens = original_tokens
1729
+
1730
+ if self.config.optimize and messages:
1731
+ try:
1732
+ context_limit = self.openai_provider.get_context_limit(model)
1733
+ result = self.openai_pipeline.apply(
1734
+ messages=messages,
1735
+ model=model,
1736
+ model_limit=context_limit,
1737
+ )
1738
+ if result.messages != messages:
1739
+ optimized_messages = result.messages
1740
+ transforms_applied = result.transforms_applied
1741
+ optimized_tokens = sum(
1742
+ tokenizer.count_text(str(m.get("content", ""))) for m in optimized_messages
1743
+ )
1744
+ except Exception as e:
1745
+ logger.warning(f"Optimization failed: {e}")
1746
+
1747
+ tokens_saved = original_tokens - optimized_tokens
1748
+ optimization_latency = (time.time() - start_time) * 1000
1749
+
1750
+ # CCR Tool Injection: Inject retrieval tool if compression occurred
1751
+ tools = body.get("tools")
1752
+ if self.config.ccr_inject_tool or self.config.ccr_inject_system_instructions:
1753
+ injector = CCRToolInjector(
1754
+ provider="openai",
1755
+ inject_tool=self.config.ccr_inject_tool,
1756
+ inject_system_instructions=self.config.ccr_inject_system_instructions,
1757
+ )
1758
+ optimized_messages, tools, was_injected = injector.process_request(
1759
+ optimized_messages, tools
1760
+ )
1761
+
1762
+ if injector.has_compressed_content:
1763
+ if was_injected:
1764
+ logger.debug(
1765
+ f"[{request_id}] CCR: Injected retrieval tool for hashes: {injector.detected_hashes}"
1766
+ )
1767
+ else:
1768
+ logger.debug(
1769
+ f"[{request_id}] CCR: Tool already present (MCP?), skipped injection for hashes: {injector.detected_hashes}"
1770
+ )
1771
+
1772
+ body["messages"] = optimized_messages
1773
+ if tools is not None:
1774
+ body["tools"] = tools
1775
+ url = f"{self.OPENAI_API_URL}/v1/chat/completions"
1776
+
1777
+ try:
1778
+ if stream:
1779
+ return await self._stream_response(
1780
+ url,
1781
+ headers,
1782
+ body,
1783
+ "openai",
1784
+ model,
1785
+ request_id,
1786
+ original_tokens,
1787
+ optimized_tokens,
1788
+ tokens_saved,
1789
+ transforms_applied,
1790
+ tags,
1791
+ optimization_latency,
1792
+ )
1793
+ else:
1794
+ response = await self._retry_request("POST", url, headers, body)
1795
+ total_latency = (time.time() - start_time) * 1000
1796
+
1797
+ output_tokens = 0
1798
+ try:
1799
+ resp_json = response.json()
1800
+ usage = resp_json.get("usage", {})
1801
+ output_tokens = usage.get("completion_tokens", 0)
1802
+ except Exception:
1803
+ pass
1804
+
1805
+ # Cost tracking
1806
+ cost_usd = savings_usd = None
1807
+ if self.cost_tracker:
1808
+ cost_usd = self.cost_tracker.estimate_cost(
1809
+ model, optimized_tokens, output_tokens
1810
+ )
1811
+ original_cost = self.cost_tracker.estimate_cost(
1812
+ model, original_tokens, output_tokens
1813
+ )
1814
+ if cost_usd and original_cost:
1815
+ savings_usd = original_cost - cost_usd
1816
+ self.cost_tracker.record_cost(cost_usd)
1817
+ self.cost_tracker.record_savings(savings_usd)
1818
+
1819
+ # Cache
1820
+ if self.cache and response.status_code == 200:
1821
+ await self.cache.set(
1822
+ messages, model, response.content, dict(response.headers), tokens_saved
1823
+ )
1824
+
1825
+ # Metrics
1826
+ await self.metrics.record_request(
1827
+ provider="openai",
1828
+ model=model,
1829
+ input_tokens=optimized_tokens,
1830
+ output_tokens=output_tokens,
1831
+ tokens_saved=tokens_saved,
1832
+ latency_ms=total_latency,
1833
+ cost_usd=cost_usd or 0,
1834
+ savings_usd=savings_usd or 0,
1835
+ )
1836
+
1837
+ if tokens_saved > 0:
1838
+ logger.info(
1839
+ f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
1840
+ f"(saved {tokens_saved:,} tokens)"
1841
+ )
1842
+
1843
+ # Remove compression headers since httpx already decompressed the response
1844
+ response_headers = dict(response.headers)
1845
+ response_headers.pop("content-encoding", None)
1846
+ response_headers.pop("content-length", None) # Length changed after decompression
1847
+
1848
+ return Response(
1849
+ content=response.content,
1850
+ status_code=response.status_code,
1851
+ headers=response_headers,
1852
+ )
1853
+ except Exception as e:
1854
+ await self.metrics.record_failed()
1855
+ # Log full error details internally for debugging
1856
+ logger.error(f"[{request_id}] OpenAI request failed: {type(e).__name__}: {e}")
1857
+ # Return sanitized error message to client (don't expose internal details)
1858
+ return JSONResponse(
1859
+ status_code=502,
1860
+ content={
1861
+ "error": {
1862
+ "message": "An error occurred while processing your request. Please try again.",
1863
+ "type": "server_error",
1864
+ "code": "proxy_error",
1865
+ }
1866
+ },
1867
+ )
1868
+
1869
+ async def handle_passthrough(self, request: Request, base_url: str) -> Response:
1870
+ """Pass through request unchanged."""
1871
+ path = request.url.path
1872
+ url = f"{base_url}{path}"
1873
+
1874
+ headers = dict(request.headers.items())
1875
+ headers.pop("host", None)
1876
+
1877
+ body = await request.body()
1878
+
1879
+ response = await self.http_client.request( # type: ignore[union-attr]
1880
+ method=request.method,
1881
+ url=url,
1882
+ headers=headers,
1883
+ content=body,
1884
+ )
1885
+
1886
+ # Remove compression headers since httpx already decompressed the response
1887
+ response_headers = dict(response.headers)
1888
+ response_headers.pop("content-encoding", None)
1889
+ response_headers.pop("content-length", None) # Length changed after decompression
1890
+
1891
+ return Response(
1892
+ content=response.content,
1893
+ status_code=response.status_code,
1894
+ headers=response_headers,
1895
+ )
1896
+
1897
+
1898
+ # =============================================================================
1899
+ # FastAPI App
1900
+ # =============================================================================
1901
+
1902
+
1903
+ def create_app(config: ProxyConfig | None = None) -> FastAPI:
1904
+ """Create FastAPI application."""
1905
+ if not FASTAPI_AVAILABLE:
1906
+ raise ImportError("FastAPI required. Install: pip install fastapi uvicorn httpx")
1907
+
1908
+ config = config or ProxyConfig()
1909
+
1910
+ app = FastAPI(
1911
+ title="Headroom Proxy",
1912
+ description="Production-ready LLM optimization proxy",
1913
+ version="1.0.0",
1914
+ )
1915
+
1916
+ # CORS
1917
+ app.add_middleware(
1918
+ CORSMiddleware,
1919
+ allow_origins=["*"],
1920
+ allow_credentials=True,
1921
+ allow_methods=["*"],
1922
+ allow_headers=["*"],
1923
+ )
1924
+
1925
+ proxy = HeadroomProxy(config)
1926
+
1927
+ @app.on_event("startup")
1928
+ async def startup():
1929
+ await proxy.startup()
1930
+
1931
+ @app.on_event("shutdown")
1932
+ async def shutdown():
1933
+ await proxy.shutdown()
1934
+
1935
+ # Health & Metrics
1936
+ @app.get("/health")
1937
+ async def health():
1938
+ return {
1939
+ "status": "healthy",
1940
+ "version": "1.0.0",
1941
+ "config": {
1942
+ "optimize": config.optimize,
1943
+ "cache": config.cache_enabled,
1944
+ "rate_limit": config.rate_limit_enabled,
1945
+ },
1946
+ }
1947
+
1948
+ @app.get("/stats")
1949
+ async def stats():
1950
+ """Get comprehensive proxy statistics.
1951
+
1952
+ This is the main stats endpoint - it aggregates data from all subsystems:
1953
+ - Request metrics (total, cached, failed, by model/provider)
1954
+ - Token usage and savings
1955
+ - Cost tracking
1956
+ - Compression (CCR) statistics
1957
+ - Telemetry/TOIN (data flywheel) statistics
1958
+ - Cache and rate limiter stats
1959
+ """
1960
+ m = proxy.metrics
1961
+
1962
+ # Calculate average latency
1963
+ avg_latency_ms = round(m.latency_sum_ms / m.latency_count, 2) if m.latency_count > 0 else 0
1964
+
1965
+ # Get compression store stats
1966
+ store = get_compression_store()
1967
+ compression_stats = store.get_stats()
1968
+
1969
+ # Get telemetry/TOIN stats
1970
+ telemetry = get_telemetry_collector()
1971
+ telemetry_stats = telemetry.get_stats()
1972
+
1973
+ # Get feedback loop stats
1974
+ feedback = get_compression_feedback()
1975
+ feedback_stats = feedback.get_stats()
1976
+
1977
+ # Calculate total tokens before compression
1978
+ total_tokens_before = m.tokens_input_total + m.tokens_saved_total
1979
+
1980
+ return {
1981
+ "requests": {
1982
+ "total": m.requests_total,
1983
+ "cached": m.requests_cached,
1984
+ "rate_limited": m.requests_rate_limited,
1985
+ "failed": m.requests_failed,
1986
+ "by_provider": dict(m.requests_by_provider),
1987
+ "by_model": dict(m.requests_by_model),
1988
+ },
1989
+ "tokens": {
1990
+ "input": m.tokens_input_total,
1991
+ "output": m.tokens_output_total,
1992
+ "saved": m.tokens_saved_total,
1993
+ "total_before_compression": total_tokens_before,
1994
+ "savings_percent": round(
1995
+ (m.tokens_saved_total / total_tokens_before * 100)
1996
+ if total_tokens_before > 0
1997
+ else 0,
1998
+ 2,
1999
+ ),
2000
+ },
2001
+ "latency": {
2002
+ "average_ms": avg_latency_ms,
2003
+ "total_requests": m.latency_count,
2004
+ },
2005
+ "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
2006
+ "compression": {
2007
+ "ccr_entries": compression_stats.get("entry_count", 0),
2008
+ "ccr_max_entries": compression_stats.get("max_entries", 0),
2009
+ "original_tokens_cached": compression_stats.get("total_original_tokens", 0),
2010
+ "compressed_tokens_cached": compression_stats.get("total_compressed_tokens", 0),
2011
+ "ccr_retrievals": compression_stats.get("total_retrievals", 0),
2012
+ },
2013
+ "telemetry": {
2014
+ "enabled": telemetry_stats.get("enabled", False),
2015
+ "total_compressions": telemetry_stats.get("total_compressions", 0),
2016
+ "total_retrievals": telemetry_stats.get("total_retrievals", 0),
2017
+ "global_retrieval_rate": round(telemetry_stats.get("global_retrieval_rate", 0), 4),
2018
+ "tool_signatures_tracked": telemetry_stats.get("tool_signatures_tracked", 0),
2019
+ "avg_compression_ratio": round(telemetry_stats.get("avg_compression_ratio", 0), 4),
2020
+ "avg_token_reduction": round(telemetry_stats.get("avg_token_reduction", 0), 4),
2021
+ },
2022
+ "feedback_loop": {
2023
+ "tools_tracked": feedback_stats.get("tools_tracked", 0),
2024
+ "total_compressions": feedback_stats.get("total_compressions", 0),
2025
+ "total_retrievals": feedback_stats.get("total_retrievals", 0),
2026
+ "global_retrieval_rate": round(feedback_stats.get("global_retrieval_rate", 0), 4),
2027
+ "tools_with_high_retrieval": sum(
2028
+ 1
2029
+ for p in feedback_stats.get("tool_patterns", {}).values()
2030
+ if p.get("retrieval_rate", 0) > 0.3
2031
+ ),
2032
+ },
2033
+ "cache": await proxy.cache.stats() if proxy.cache else None,
2034
+ "rate_limiter": await proxy.rate_limiter.stats() if proxy.rate_limiter else None,
2035
+ "recent_requests": proxy.logger.get_recent(10) if proxy.logger else [],
2036
+ }
2037
+
2038
+ @app.get("/metrics")
2039
+ async def metrics():
2040
+ """Prometheus metrics endpoint."""
2041
+ return PlainTextResponse(
2042
+ await proxy.metrics.export(),
2043
+ media_type="text/plain; version=0.0.4",
2044
+ )
2045
+
2046
+ @app.post("/cache/clear")
2047
+ async def clear_cache():
2048
+ """Clear the response cache."""
2049
+ if proxy.cache:
2050
+ await proxy.cache.clear()
2051
+ return {"status": "cleared"}
2052
+ return {"status": "cache disabled"}
2053
+
2054
+ # CCR (Compress-Cache-Retrieve) endpoints
2055
+ @app.post("/v1/retrieve")
2056
+ async def ccr_retrieve(request: Request):
2057
+ """Retrieve original content from CCR compression cache.
2058
+
2059
+ This is the "Retrieve" part of CCR (Compress-Cache-Retrieve).
2060
+ When SmartCrusher compresses tool outputs, the original data is cached.
2061
+ LLMs can call this endpoint to get more data if needed.
2062
+
2063
+ Request body:
2064
+ hash (str): Hash key from compression marker (required)
2065
+ query (str): Optional search query to filter results
2066
+
2067
+ Response:
2068
+ Full retrieval: {"hash": "...", "original_content": "...", ...}
2069
+ Search: {"hash": "...", "query": "...", "results": [...], "count": N}
2070
+ """
2071
+ data = await request.json()
2072
+ hash_key = data.get("hash")
2073
+ query = data.get("query")
2074
+
2075
+ if not hash_key:
2076
+ raise HTTPException(status_code=400, detail="hash required")
2077
+
2078
+ store = get_compression_store()
2079
+
2080
+ if query:
2081
+ # Search within cached content
2082
+ results = store.search(hash_key, query)
2083
+ return {
2084
+ "hash": hash_key,
2085
+ "query": query,
2086
+ "results": results,
2087
+ "count": len(results),
2088
+ }
2089
+ else:
2090
+ # Return full original content
2091
+ entry = store.retrieve(hash_key)
2092
+ if entry:
2093
+ return {
2094
+ "hash": hash_key,
2095
+ "original_content": entry.original_content,
2096
+ "original_tokens": entry.original_tokens,
2097
+ "original_item_count": entry.original_item_count,
2098
+ "compressed_item_count": entry.compressed_item_count,
2099
+ "tool_name": entry.tool_name,
2100
+ "retrieval_count": entry.retrieval_count,
2101
+ }
2102
+ raise HTTPException(
2103
+ status_code=404, detail="Entry not found or expired (TTL: 5 minutes)"
2104
+ )
2105
+
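# Illustrative client sketch (not part of the original file): calling the /v1/retrieve
# endpoint described above. The host/port and hash value are placeholders.
import httpx

def retrieve_compressed(hash_key: str, query: str | None = None) -> dict:
    payload = {"hash": hash_key}
    if query:
        payload["query"] = query  # search within the cached content instead of full retrieval
    resp = httpx.post("http://127.0.0.1:8787/v1/retrieve", json=payload, timeout=30.0)
    resp.raise_for_status()  # 404 means the entry was not found or expired (5-minute TTL)
    return resp.json()

# retrieve_compressed("abc123")                      -> full original content
# retrieve_compressed("abc123", query="error rate")  -> filtered search results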
2106
+ @app.get("/v1/retrieve/stats")
2107
+ async def ccr_stats():
2108
+ """Get CCR compression store statistics."""
2109
+ store = get_compression_store()
2110
+ stats = store.get_stats()
2111
+ events = store.get_retrieval_events(limit=20)
2112
+ return {
2113
+ "store": stats,
2114
+ "recent_retrievals": [
2115
+ {
2116
+ "hash": e.hash,
2117
+ "query": e.query,
2118
+ "items_retrieved": e.items_retrieved,
2119
+ "total_items": e.total_items,
2120
+ "tool_name": e.tool_name,
2121
+ "retrieval_type": e.retrieval_type,
2122
+ }
2123
+ for e in events
2124
+ ],
2125
+ }
2126
+
2127
+ @app.get("/v1/feedback")
2128
+ async def ccr_feedback():
2129
+ """Get CCR feedback loop statistics and learned patterns.
2130
+
2131
+ This endpoint exposes the feedback loop's learned patterns for monitoring
2132
+ and debugging. It shows:
2133
+ - Per-tool retrieval rates (high = compress less aggressively)
2134
+ - Common search queries per tool
2135
+ - Queried fields (suggest what to preserve)
2136
+
2137
+ Use this to understand how well compression is working and whether
2138
+ the feedback loop is adjusting appropriately.
2139
+ """
2140
+ feedback = get_compression_feedback()
2141
+ stats = feedback.get_stats()
2142
+ return {
2143
+ "feedback": stats,
2144
+ "hints_example": {
2145
+ tool_name: {
2146
+ "hints": {
2147
+ "max_items": hints.max_items
2148
+ if (hints := feedback.get_compression_hints(tool_name))
2149
+ else 15,
2150
+ "suggested_items": hints.suggested_items if hints else None,
2151
+ "skip_compression": hints.skip_compression if hints else False,
2152
+ "preserve_fields": hints.preserve_fields if hints else [],
2153
+ "reason": hints.reason if hints else "",
2154
+ }
2155
+ }
2156
+ for tool_name in list(stats.get("tool_patterns", {}).keys())[:5]
2157
+ },
2158
+ }
2159
+
2160
+ @app.get("/v1/feedback/{tool_name}")
2161
+ async def ccr_feedback_for_tool(tool_name: str):
2162
+ """Get compression hints for a specific tool.
2163
+
2164
+ Returns feedback-based hints that would be used for compressing
2165
+ this tool's output.
2166
+ """
2167
+ feedback = get_compression_feedback()
2168
+ hints = feedback.get_compression_hints(tool_name)
2169
+ patterns = feedback.get_all_patterns().get(tool_name)
2170
+
2171
+ return {
2172
+ "tool_name": tool_name,
2173
+ "hints": {
2174
+ "max_items": hints.max_items,
2175
+ "min_items": hints.min_items,
2176
+ "suggested_items": hints.suggested_items,
2177
+ "aggressiveness": hints.aggressiveness,
2178
+ "skip_compression": hints.skip_compression,
2179
+ "preserve_fields": hints.preserve_fields,
2180
+ "reason": hints.reason,
2181
+ },
2182
+ "pattern": {
2183
+ "total_compressions": patterns.total_compressions if patterns else 0,
2184
+ "total_retrievals": patterns.total_retrievals if patterns else 0,
2185
+ "retrieval_rate": patterns.retrieval_rate if patterns else 0.0,
2186
+ "full_retrieval_rate": patterns.full_retrieval_rate if patterns else 0.0,
2187
+ "search_rate": patterns.search_rate if patterns else 0.0,
2188
+ "common_queries": list(patterns.common_queries.keys())[:10] if patterns else [],
2189
+ "queried_fields": list(patterns.queried_fields.keys())[:10] if patterns else [],
2190
+ }
2191
+ if patterns
2192
+ else None,
2193
+ }
2194
+
2195
+ # Telemetry endpoints (Data Flywheel)
2196
+ @app.get("/v1/telemetry")
2197
+ async def telemetry_stats():
2198
+ """Get telemetry statistics for the data flywheel.
2199
+
2200
+ This endpoint exposes privacy-preserving telemetry data that powers
2201
+ the data flywheel - learning optimal compression strategies across
2202
+ tool types based on usage patterns.
2203
+
2204
+ What's collected (anonymized):
2205
+ - Tool output structure patterns (field types, not values)
2206
+ - Compression decisions and ratios
2207
+ - Retrieval patterns (rate, type, not content)
2208
+ - Strategy effectiveness
2209
+
2210
+ What's NOT collected:
2211
+ - Actual data values
2212
+ - User identifiers
2213
+ - Queries or search terms
2214
+ - File paths or tool names (hashed by default)
2215
+ """
2216
+ telemetry = get_telemetry_collector()
2217
+ return telemetry.get_stats()
2218
+
2219
+ @app.get("/v1/telemetry/export")
2220
+ async def telemetry_export():
2221
+ """Export full telemetry data for aggregation.
2222
+
2223
+ This endpoint exports all telemetry data in a format suitable for
2224
+ cross-user aggregation. The data is privacy-preserving - no actual
2225
+ values are included, only structural patterns and statistics.
2226
+
2227
+ Use this for:
2228
+ - Building a central learning service
2229
+ - Sharing learned patterns across instances
2230
+ - Analysis and debugging
2231
+ """
2232
+ telemetry = get_telemetry_collector()
2233
+ return telemetry.export_stats()
2234
+
2235
+ @app.post("/v1/telemetry/import")
2236
+ async def telemetry_import(request: Request):
2237
+ """Import telemetry data from another source.
2238
+
2239
+ This allows merging telemetry from multiple sources for cross-user
2240
+ learning. The imported data is merged with existing statistics.
2241
+
2242
+ Request body: Telemetry export data from /v1/telemetry/export
2243
+ """
2244
+ telemetry = get_telemetry_collector()
2245
+ data = await request.json()
2246
+ telemetry.import_stats(data)
2247
+ return {"status": "imported", "current_stats": telemetry.get_stats()}
2248
+
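# Illustrative sketch (not part of the original file): moving telemetry between two proxy
# instances using the export/import endpoints above. Both base URLs are placeholders.
import httpx

def sync_telemetry(source_base: str, target_base: str) -> dict:
    exported = httpx.get(f"{source_base}/v1/telemetry/export", timeout=30.0).json()
    # The import endpoint merges the payload into the target's existing statistics.
    result = httpx.post(f"{target_base}/v1/telemetry/import", json=exported, timeout=30.0)
    result.raise_for_status()
    return result.json()

# sync_telemetry("http://127.0.0.1:8787", "http://10.0.0.2:8787")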
2249
+ @app.get("/v1/telemetry/tools")
2250
+ async def telemetry_tools():
2251
+ """Get telemetry statistics for all tracked tool signatures.
2252
+
2253
+ Returns statistics per tool signature (anonymized), including:
2254
+ - Compression ratios and strategy usage
2255
+ - Retrieval rates (high = compression too aggressive)
2256
+ - Learned recommendations
2257
+ """
2258
+ telemetry = get_telemetry_collector()
2259
+ all_stats = telemetry.get_all_tool_stats()
2260
+ return {
2261
+ "tool_count": len(all_stats),
2262
+ "tools": {sig_hash: stats.to_dict() for sig_hash, stats in all_stats.items()},
2263
+ }
2264
+
2265
+ @app.get("/v1/telemetry/tools/{signature_hash}")
2266
+ async def telemetry_tool_detail(signature_hash: str):
2267
+ """Get detailed telemetry for a specific tool signature.
2268
+
2269
+ Includes learned recommendations if enough data has been collected.
2270
+ """
2271
+ telemetry = get_telemetry_collector()
2272
+ stats = telemetry.get_tool_stats(signature_hash)
2273
+ recommendations = telemetry.get_recommendations(signature_hash)
2274
+
2275
+ if stats is None:
2276
+ raise HTTPException(
2277
+ status_code=404, detail=f"No telemetry found for signature: {signature_hash}"
2278
+ )
2279
+
2280
+ return {
2281
+ "signature_hash": signature_hash,
2282
+ "stats": stats.to_dict(),
2283
+ "recommendations": recommendations,
2284
+ }
2285
+
2286
+ @app.get("/v1/retrieve/{hash_key}")
2287
+ async def ccr_retrieve_get(hash_key: str, query: str | None = None):
2288
+ """GET version of CCR retrieve for easier testing."""
2289
+ store = get_compression_store()
2290
+
2291
+ if query:
2292
+ results = store.search(hash_key, query)
2293
+ return {
2294
+ "hash": hash_key,
2295
+ "query": query,
2296
+ "results": results,
2297
+ "count": len(results),
2298
+ }
2299
+ else:
2300
+ entry = store.retrieve(hash_key)
2301
+ if entry:
2302
+ return {
2303
+ "hash": hash_key,
2304
+ "original_content": entry.original_content,
2305
+ "original_tokens": entry.original_tokens,
2306
+ "original_item_count": entry.original_item_count,
2307
+ "compressed_item_count": entry.compressed_item_count,
2308
+ "tool_name": entry.tool_name,
2309
+ "retrieval_count": entry.retrieval_count,
2310
+ }
2311
+ raise HTTPException(status_code=404, detail="Entry not found or expired")
2312
+
2313
+ # CCR Tool Call Handler - for agent frameworks to call when LLM uses headroom_retrieve
2314
+ @app.post("/v1/retrieve/tool_call")
2315
+ async def ccr_handle_tool_call(request: Request):
2316
+ """Handle a CCR tool call from an LLM response.
2317
+
2318
+ This endpoint accepts tool call formats from various providers and returns
2319
+ a properly formatted tool result. Agent frameworks can use this to handle
2320
+ CCR tool calls without implementing the retrieval logic themselves.
2321
+
2322
+ Request body (Anthropic format):
2323
+ {
2324
+ "tool_call": {
2325
+ "id": "toolu_123",
2326
+ "name": "headroom_retrieve",
2327
+ "input": {"hash": "abc123", "query": "optional search"}
2328
+ },
2329
+ "provider": "anthropic"
2330
+ }
2331
+
2332
+ Request body (OpenAI format):
2333
+ {
2334
+ "tool_call": {
2335
+ "id": "call_123",
2336
+ "function": {
2337
+ "name": "headroom_retrieve",
2338
+ "arguments": "{\"hash\": \"abc123\"}"
2339
+ }
2340
+ },
2341
+ "provider": "openai"
2342
+ }
2343
+
2344
+ Response:
2345
+ {
2346
+ "tool_result": {...}, # Formatted for the provider
2347
+ "success": true,
2348
+ "data": {...} # Raw retrieval data
2349
+ }
2350
+ """
2351
+ data = await request.json()
2352
+ tool_call = data.get("tool_call", {})
2353
+ provider = data.get("provider", "anthropic")
2354
+
2355
+ # Parse the tool call
2356
+ hash_key, query = parse_tool_call(tool_call, provider)
2357
+
2358
+ if hash_key is None:
2359
+ raise HTTPException(
2360
+ status_code=400, detail=f"Invalid tool call or not a {CCR_TOOL_NAME} call"
2361
+ )
2362
+
2363
+ # Perform retrieval
2364
+ store = get_compression_store()
2365
+
2366
+ if query:
2367
+ results = store.search(hash_key, query)
2368
+ retrieval_data = {
2369
+ "hash": hash_key,
2370
+ "query": query,
2371
+ "results": results,
2372
+ "count": len(results),
2373
+ }
2374
+ else:
2375
+ entry = store.retrieve(hash_key)
2376
+ if entry:
2377
+ retrieval_data = {
2378
+ "hash": hash_key,
2379
+ "original_content": entry.original_content,
2380
+ "original_item_count": entry.original_item_count,
2381
+ "compressed_item_count": entry.compressed_item_count,
2382
+ }
2383
+ else:
2384
+ retrieval_data = {
2385
+ "error": "Entry not found or expired (TTL: 5 minutes)",
2386
+ "hash": hash_key,
2387
+ }
2388
+
2389
+ # Format tool result for provider
2390
+ tool_call_id = tool_call.get("id", "")
2391
+ result_content = json.dumps(retrieval_data, indent=2)
2392
+
2393
+ if provider == "anthropic":
2394
+ tool_result = {
2395
+ "type": "tool_result",
2396
+ "tool_use_id": tool_call_id,
2397
+ "content": result_content,
2398
+ }
2399
+ elif provider == "openai":
2400
+ tool_result = {
2401
+ "role": "tool",
2402
+ "tool_call_id": tool_call_id,
2403
+ "content": result_content,
2404
+ }
2405
+ else:
2406
+ tool_result = {
2407
+ "tool_call_id": tool_call_id,
2408
+ "content": result_content,
2409
+ }
2410
+
2411
+ return {
2412
+ "tool_result": tool_result,
2413
+ "success": "error" not in retrieval_data,
2414
+ "data": retrieval_data,
2415
+ }
2416
+
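# Illustrative sketch (not part of the original file): how an agent loop might forward a
# headroom_retrieve tool call (OpenAI format) to the /v1/retrieve/tool_call handler above
# and append the formatted result to the conversation. The proxy URL is a placeholder.
import httpx

def handle_ccr_tool_call(tool_call: dict, messages: list[dict]) -> None:
    resp = httpx.post(
        "http://127.0.0.1:8787/v1/retrieve/tool_call",
        json={"tool_call": tool_call, "provider": "openai"},
        timeout=30.0,
    )
    resp.raise_for_status()
    # The response already carries a provider-shaped tool result, e.g. for OpenAI:
    # {"role": "tool", "tool_call_id": ..., "content": ...}
    messages.append(resp.json()["tool_result"])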
2417
+ # Anthropic endpoints
2418
+ @app.post("/v1/messages")
2419
+ async def anthropic_messages(request: Request):
2420
+ return await proxy.handle_anthropic_messages(request)
2421
+
2422
+ @app.post("/v1/messages/count_tokens")
2423
+ async def anthropic_count_tokens(request: Request):
2424
+ return await proxy.handle_passthrough(request, proxy.ANTHROPIC_API_URL)
2425
+
2426
+ # OpenAI endpoints
2427
+ @app.post("/v1/chat/completions")
2428
+ async def openai_chat(request: Request):
2429
+ return await proxy.handle_openai_chat(request)
2430
+
2431
+ # Passthrough - route to correct backend based on headers
2432
+ @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
2433
+ async def passthrough(request: Request, path: str):
2434
+ # Anthropic SDK always sends anthropic-version header and uses x-api-key for auth
2435
+ # OpenAI SDK uses Authorization: Bearer for auth
2436
+ if request.headers.get("anthropic-version") or request.headers.get("x-api-key"):
2437
+ base_url = proxy.ANTHROPIC_API_URL
2438
+ else:
2439
+ base_url = proxy.OPENAI_API_URL
2440
+ return await proxy.handle_passthrough(request, base_url)
2441
+
2442
+ return app
2443
+
2444
+
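# Illustrative sketch (not part of the original file): embedding the proxy in an existing
# process instead of using run_server()/the CLI entry point below. The import path follows
# the package layout (headroom/proxy/server.py); passing these fields as ProxyConfig
# keyword arguments is an assumption based on the attributes referenced in this module.
import uvicorn
from headroom.proxy.server import ProxyConfig, create_app

config = ProxyConfig(host="127.0.0.1", port=8787, optimize=True, cache_enabled=True)
app = create_app(config)  # raises ImportError if FastAPI/uvicorn/httpx are missing

if __name__ == "__main__":
    uvicorn.run(app, host=config.host, port=config.port, log_level="warning")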
2445
+ def _get_llmlingua_banner_status(config: ProxyConfig) -> str:
2446
+ """Get LLMLingua status line for banner."""
2447
+ if config.llmlingua_enabled:
2448
+ if _LLMLINGUA_AVAILABLE:
2449
+ return (
2450
+ f"ENABLED (device={config.llmlingua_device}, rate={config.llmlingua_target_rate})"
2451
+ )
2452
+ else:
2453
+ return "NOT INSTALLED (pip install headroom-ai[llmlingua])"
2454
+ else:
2455
+ if _LLMLINGUA_AVAILABLE:
2456
+ return "DISABLED (remove --no-llmlingua to enable)"
2457
+ return "DISABLED"
2458
+
2459
+
2460
+ def _get_code_aware_banner_status(config: ProxyConfig) -> str:
2461
+ """Get code-aware compression status line for banner."""
2462
+ if config.code_aware_enabled:
2463
+ if is_tree_sitter_available():
2464
+ return "ENABLED (AST-based)"
2465
+ else:
2466
+ return "NOT INSTALLED (pip install headroom-ai[code])"
2467
+ else:
2468
+ if is_tree_sitter_available():
2469
+ return "DISABLED (remove --no-code-aware to enable)"
2470
+ return "DISABLED"
2471
+
2472
+
2473
+ def run_server(config: ProxyConfig | None = None):
2474
+ """Run the proxy server."""
2475
+ if not FASTAPI_AVAILABLE:
2476
+ print("ERROR: FastAPI required. Install: pip install fastapi uvicorn httpx")
2477
+ sys.exit(1)
2478
+
2479
+ config = config or ProxyConfig()
2480
+ app = create_app(config)
2481
+
2482
+ llmlingua_status = _get_llmlingua_banner_status(config)
2483
+ code_aware_status = _get_code_aware_banner_status(config)
2484
+
2485
+ print(f"""
2486
+ ╔══════════════════════════════════════════════════════════════════════╗
2487
+ ║ HEADROOM PROXY SERVER ║
2488
+ ╠══════════════════════════════════════════════════════════════════════╣
2489
+ ║ Version: 1.0.0 ║
2490
+ ║ Listening: http://{config.host}:{config.port:<5} ║
2491
+ ╠══════════════════════════════════════════════════════════════════════╣
2492
+ ║ FEATURES: ║
2493
+ ║ Optimization: {"ENABLED " if config.optimize else "DISABLED"} ║
2494
+ ║ Caching: {"ENABLED " if config.cache_enabled else "DISABLED"} (TTL: {config.cache_ttl_seconds}s) ║
2495
+ ║ Rate Limiting: {"ENABLED " if config.rate_limit_enabled else "DISABLED"} ({config.rate_limit_requests_per_minute} req/min, {config.rate_limit_tokens_per_minute:,} tok/min) ║
2496
+ ║ Retry: {"ENABLED " if config.retry_enabled else "DISABLED"} (max {config.retry_max_attempts} attempts) ║
2497
+ ║ Cost Tracking: {"ENABLED " if config.cost_tracking_enabled else "DISABLED"} (budget: {"$" + str(config.budget_limit_usd) + "/" + config.budget_period if config.budget_limit_usd else "unlimited"}) ║
2498
+ ║ LLMLingua: {llmlingua_status:<52}║
2499
+ ║ Code-Aware: {code_aware_status:<52}║
2500
+ ╠══════════════════════════════════════════════════════════════════════╣
2501
+ ║ USAGE: ║
2502
+ ║ Claude Code: ANTHROPIC_BASE_URL=http://{config.host}:{config.port} claude ║
2503
+ ║ Cursor: Set base URL in settings ║
2504
+ ╠══════════════════════════════════════════════════════════════════════╣
2505
+ ║ ENDPOINTS: ║
2506
+ ║ /health Health check ║
2507
+ ║ /stats Detailed statistics ║
2508
+ ║ /metrics Prometheus metrics ║
2509
+ ║ /cache/clear Clear response cache ║
2510
+ ║ /v1/retrieve CCR: Retrieve compressed content ║
2511
+ ║ /v1/retrieve/stats CCR: Compression store stats ║
2512
+ ║ /v1/retrieve/tool_call CCR: Handle LLM tool calls ║
2513
+ ║ /v1/feedback CCR: Feedback loop stats & patterns ║
2514
+ ║ /v1/feedback/{{tool}} CCR: Compression hints for a tool ║
2515
+ ║ /v1/telemetry Data flywheel: Telemetry stats ║
2516
+ ║ /v1/telemetry/export Data flywheel: Export for aggregation ║
2517
+ ║ /v1/telemetry/tools Data flywheel: Per-tool stats ║
2518
+ ╚══════════════════════════════════════════════════════════════════════╝
2519
+ """)
2520
+
2521
+ uvicorn.run(app, host=config.host, port=config.port, log_level="warning")
2522
+
2523
+
2524
+ def _get_env_bool(name: str, default: bool) -> bool:
2525
+ """Get boolean from environment variable."""
2526
+ val = os.environ.get(name)
2527
+ if val is None:
2528
+ return default
2529
+ return val.lower() in ("true", "1", "yes", "on")
2530
+
2531
+
2532
+ def _get_env_int(name: str, default: int) -> int:
2533
+ """Get integer from environment variable."""
2534
+ val = os.environ.get(name)
2535
+ if val is None:
2536
+ return default
2537
+ try:
2538
+ return int(val)
2539
+ except ValueError:
2540
+ return default
2541
+
2542
+
2543
+ def _get_env_float(name: str, default: float) -> float:
2544
+ """Get float from environment variable."""
2545
+ val = os.environ.get(name)
2546
+ if val is None:
2547
+ return default
2548
+ try:
2549
+ return float(val)
2550
+ except ValueError:
2551
+ return default
2552
+
2553
+
2554
+ def _get_env_str(name: str, default: str) -> str:
2555
+ """Get string from environment variable."""
2556
+ return os.environ.get(name, default)
2557
+
2558
+
2559
+ if __name__ == "__main__":
2560
+ parser = argparse.ArgumentParser(description="Headroom Proxy Server")
2561
+
2562
+ # Server
2563
+ parser.add_argument("--host", default="127.0.0.1")
2564
+ parser.add_argument("--port", type=int, default=8787)
2565
+
2566
+ # Optimization
2567
+ parser.add_argument("--no-optimize", action="store_true", help="Disable optimization")
2568
+ parser.add_argument("--min-tokens", type=int, default=500, help="Min tokens to crush")
2569
+ parser.add_argument("--max-items", type=int, default=50, help="Max items after crush")
2570
+
2571
+ # Caching
2572
+ parser.add_argument("--no-cache", action="store_true", help="Disable caching")
2573
+ parser.add_argument("--cache-ttl", type=int, default=3600, help="Cache TTL seconds")
2574
+
2575
+ # Rate limiting
2576
+ parser.add_argument("--no-rate-limit", action="store_true", help="Disable rate limiting")
2577
+ parser.add_argument("--rpm", type=int, default=60, help="Requests per minute")
2578
+ parser.add_argument("--tpm", type=int, default=100000, help="Tokens per minute")
2579
+
2580
+ # Cost
2581
+ parser.add_argument("--budget", type=float, help="Budget limit in USD")
2582
+ parser.add_argument("--budget-period", choices=["hourly", "daily", "monthly"], default="daily")
2583
+
2584
+ # Logging
2585
+ parser.add_argument("--log-file", help="Log file path")
2586
+ parser.add_argument("--log-messages", action="store_true", help="Log full messages")
2587
+
2588
+ # Smart routing (content-aware compression)
2589
+ parser.add_argument(
2590
+ "--no-smart-routing",
2591
+ action="store_true",
2592
+ help="Disable smart routing (use legacy sequential pipeline)",
2593
+ )
2594
+
2595
+ # LLMLingua ML-based compression
2596
+ parser.add_argument(
2597
+ "--llmlingua",
2598
+ action="store_true",
2599
+ help="Enable LLMLingua-2 ML-based compression (requires: pip install headroom-ai[llmlingua])",
2600
+ )
2601
+ parser.add_argument(
2602
+ "--no-llmlingua",
2603
+ action="store_true",
2604
+ help="Disable LLMLingua compression",
2605
+ )
2606
+ parser.add_argument(
2607
+ "--llmlingua-device",
2608
+ choices=["auto", "cuda", "cpu", "mps"],
2609
+ default="auto",
2610
+ help="Device for LLMLingua model (default: auto)",
2611
+ )
2612
+ parser.add_argument(
2613
+ "--llmlingua-rate",
2614
+ type=float,
2615
+ default=0.3,
2616
+ help="LLMLingua target compression rate, 0.0-1.0 (default: 0.3 = keep 30%%)",
2617
+ )
2618
+
2619
+ # Code-aware compression
2620
+ parser.add_argument(
2621
+ "--code-aware",
2622
+ action="store_true",
2623
+ help="Enable AST-based code compression (requires: pip install headroom-ai[code])",
2624
+ )
2625
+ parser.add_argument(
2626
+ "--no-code-aware",
2627
+ action="store_true",
2628
+ help="Disable code-aware compression",
2629
+ )
2630
+
2631
+ args = parser.parse_args()
2632
+
2633
+ # Environment variable defaults (HEADROOM_* prefix)
2634
+ # Feature toggles: explicit --X/--no-X flags override HEADROOM_* env vars; value settings (host, port, limits, TTLs): env vars override CLI values; both override ProxyConfig defaults
2635
+ env_smart_routing = _get_env_bool("HEADROOM_SMART_ROUTING", True)
2636
+ env_llmlingua = _get_env_bool("HEADROOM_LLMLINGUA_ENABLED", True)
2637
+ env_code_aware = _get_env_bool("HEADROOM_CODE_AWARE_ENABLED", True)
2638
+ env_optimize = _get_env_bool("HEADROOM_OPTIMIZE", True)
2639
+ env_cache = _get_env_bool("HEADROOM_CACHE_ENABLED", True)
2640
+ env_rate_limit = _get_env_bool("HEADROOM_RATE_LIMIT_ENABLED", True)
2641
+
2642
+ # Determine settings: CLI flags override env vars
2643
+ # --no-X explicitly disables, --X explicitly enables, neither uses env var
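# Worked example of the resolution below (hypothetical invocations):
#   no flags, HEADROOM_LLMLINGUA_ENABLED=false  -> llmlingua_enabled = False (env value used)
#   --llmlingua                                 -> True  (flag overrides env)
#   --no-llmlingua                              -> False (flag overrides env)
#   --llmlingua --no-llmlingua                  -> True  (--llmlingua wins in the else branch)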
2644
+ smart_routing = env_smart_routing if not args.no_smart_routing else False
2645
+ llmlingua_enabled = (
2646
+ env_llmlingua
2647
+ if not (args.llmlingua or args.no_llmlingua)
2648
+ else (args.llmlingua or not args.no_llmlingua)
2649
+ )
2650
+ code_aware_enabled = (
2651
+ env_code_aware
2652
+ if not (args.code_aware or args.no_code_aware)
2653
+ else (args.code_aware or not args.no_code_aware)
2654
+ )
2655
+ optimize = env_optimize if not args.no_optimize else False
2656
+ cache_enabled = env_cache if not args.no_cache else False
2657
+ rate_limit_enabled = env_rate_limit if not args.no_rate_limit else False
2658
+
2659
+ config = ProxyConfig(
2660
+ host=_get_env_str("HEADROOM_HOST", args.host),
2661
+ port=_get_env_int("HEADROOM_PORT", args.port),
2662
+ optimize=optimize,
2663
+ min_tokens_to_crush=_get_env_int("HEADROOM_MIN_TOKENS", args.min_tokens),
2664
+ max_items_after_crush=_get_env_int("HEADROOM_MAX_ITEMS", args.max_items),
2665
+ cache_enabled=cache_enabled,
2666
+ cache_ttl_seconds=_get_env_int("HEADROOM_CACHE_TTL", args.cache_ttl),
2667
+ rate_limit_enabled=rate_limit_enabled,
2668
+ rate_limit_requests_per_minute=_get_env_int("HEADROOM_RPM", args.rpm),
2669
+ rate_limit_tokens_per_minute=_get_env_int("HEADROOM_TPM", args.tpm),
2670
+ budget_limit_usd=args.budget,
2671
+ budget_period=args.budget_period,
2672
+ log_file=_get_env_str("HEADROOM_LOG_FILE", args.log_file)
2673
+ if args.log_file
2674
+ else os.environ.get("HEADROOM_LOG_FILE"),
2675
+ log_full_messages=args.log_messages or _get_env_bool("HEADROOM_LOG_MESSAGES", False),
2676
+ smart_routing=smart_routing,
2677
+ llmlingua_enabled=llmlingua_enabled,
2678
+ llmlingua_device=_get_env_str("HEADROOM_LLMLINGUA_DEVICE", args.llmlingua_device),
2679
+ llmlingua_target_rate=_get_env_float("HEADROOM_LLMLINGUA_RATE", args.llmlingua_rate),
2680
+ code_aware_enabled=code_aware_enabled,
2681
+ )
2682
+
2683
+ run_server(config)