headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
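
The largest addition in this release is headroom/proxy/server.py, reproduced below. As a rough, hypothetical sketch (not part of the package) of how that proxy is exercised once started with "python -m headroom.proxy.server --port 8787" as its module docstring describes: the default port (8787), the Anthropic-style /v1/messages route, the x-api-key header, and the optional x-headroom-* tag headers are taken from the code below; the model name, prompt, and anthropic-version value are illustrative assumptions.

    import os
    import httpx

    # The proxy listens on 127.0.0.1:8787 by default (see ProxyConfig) and exposes
    # an Anthropic-compatible /v1/messages endpoint that it optimizes and forwards.
    resp = httpx.post(
        "http://localhost:8787/v1/messages",
        headers={
            "x-api-key": os.environ["ANTHROPIC_API_KEY"],   # forwarded upstream
            "anthropic-version": "2023-06-01",               # assumed standard header
            "x-headroom-team": "demo",                       # optional tag, surfaced in request logs
        },
        json={
            "model": "claude-3-5-sonnet-latest",
            "max_tokens": 256,
            "messages": [{"role": "user", "content": "Hello"}],
        },
        timeout=60.0,
    )
    print(resp.json())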
headroom/proxy/server.py
ADDED
@@ -0,0 +1,2683 @@
"""Headroom Proxy Server - Production Ready.

A full-featured LLM proxy with optimization, caching, rate limiting,
and observability.

Features:
- Context optimization (SmartCrusher, CacheAligner, RollingWindow)
- Semantic caching (save costs on repeated queries)
- Rate limiting (token bucket)
- Retry with exponential backoff
- Cost tracking and budgets
- Request tagging and metadata
- Provider fallback
- Prometheus metrics
- Full request/response logging

Usage:
    python -m headroom.proxy.server --port 8787

    # With Claude Code:
    ANTHROPIC_BASE_URL=http://localhost:8787 claude
"""

from __future__ import annotations

import argparse
import asyncio
import hashlib
import json
import logging
import os
import random
import sys
import time
from collections import OrderedDict, defaultdict, deque
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Literal

import httpx

try:
    import uvicorn
    from fastapi import FastAPI, HTTPException, Request, Response
    from fastapi.middleware.cors import CORSMiddleware
    from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse

    FASTAPI_AVAILABLE = True
except ImportError:
    FASTAPI_AVAILABLE = False

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from headroom.cache.compression_feedback import get_compression_feedback
from headroom.cache.compression_store import get_compression_store
from headroom.ccr import (
    CCR_TOOL_NAME,
    CCRResponseHandler,
    CCRToolInjector,
    ContextTracker,
    ContextTrackerConfig,
    ResponseHandlerConfig,
    parse_tool_call,
)
from headroom.config import CacheAlignerConfig, CCRConfig, RollingWindowConfig, SmartCrusherConfig
from headroom.providers import AnthropicProvider, OpenAIProvider
from headroom.telemetry import get_telemetry_collector
from headroom.tokenizers import get_tokenizer
from headroom.transforms import (
    _LLMLINGUA_AVAILABLE,
    CacheAligner,
    CodeAwareCompressor,
    CodeCompressorConfig,
    ContentRouter,
    ContentRouterConfig,
    RollingWindow,
    SmartCrusher,
    TransformPipeline,
    is_tree_sitter_available,
)

# Conditionally import LLMLingua if available
if _LLMLINGUA_AVAILABLE:
    from headroom.transforms import LLMLinguaCompressor, LLMLinguaConfig

# Try to import LiteLLM for pricing
try:
    import litellm

    LITELLM_AVAILABLE = True
except ImportError:
    LITELLM_AVAILABLE = False

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("headroom.proxy")

# Maximum request body size (10MB)
MAX_REQUEST_BODY_SIZE = 10 * 1024 * 1024


# =============================================================================
# Data Models
# =============================================================================


@dataclass
class RequestLog:
    """Complete log of a single request."""

    request_id: str
    timestamp: str
    provider: str
    model: str

    # Tokens
    input_tokens_original: int
    input_tokens_optimized: int
    output_tokens: int | None
    tokens_saved: int
    savings_percent: float

    # Cost
    estimated_cost_usd: float | None
    estimated_savings_usd: float | None

    # Performance
    optimization_latency_ms: float
    total_latency_ms: float | None

    # Metadata
    tags: dict[str, str]
    cache_hit: bool
    transforms_applied: list[str]

    # Request/Response (optional, for debugging)
    request_messages: list[dict] | None = None
    response_content: str | None = None
    error: str | None = None


@dataclass
class CacheEntry:
    """Cached response entry."""

    response_body: bytes
    response_headers: dict[str, str]
    created_at: datetime
    ttl_seconds: int
    hit_count: int = 0
    tokens_saved_per_hit: int = 0


@dataclass
class RateLimitState:
    """Token bucket rate limiter state."""

    tokens: float
    last_update: float


@dataclass
class ProxyConfig:
    """Proxy configuration."""

    # Server
    host: str = "127.0.0.1"
    port: int = 8787

    # Optimization
    optimize: bool = True
    min_tokens_to_crush: int = 500
    max_items_after_crush: int = 50
    keep_last_turns: int = 4

    # CCR Tool Injection
    ccr_inject_tool: bool = True  # Inject headroom_retrieve tool when compression occurs
    ccr_inject_system_instructions: bool = False  # Add instructions to system message

    # CCR Response Handling (intercept and handle CCR tool calls automatically)
    ccr_handle_responses: bool = True  # Handle headroom_retrieve calls in responses
    ccr_max_retrieval_rounds: int = 3  # Max rounds of retrieval before returning

    # CCR Context Tracking (track compressed content across turns)
    ccr_context_tracking: bool = True  # Track compressed contexts for proactive expansion
    ccr_proactive_expansion: bool = True  # Proactively expand based on query relevance
    ccr_max_proactive_expansions: int = 2  # Max contexts to proactively expand per turn

    # LLMLingua ML-based compression (ON by default if installed)
    llmlingua_enabled: bool = True  # Enable LLMLingua-2 for ML-based compression
    llmlingua_device: str = "auto"  # Device: 'auto', 'cuda', 'cpu', 'mps'
    llmlingua_target_rate: float = 0.3  # Target compression rate (0.3 = keep 30%)

    # Code-aware compression (ON by default if installed)
    code_aware_enabled: bool = True  # Enable AST-based code compression

    # Smart content routing (routes each message to optimal compressor)
    smart_routing: bool = True  # Use ContentRouter for intelligent compression

    # Caching
    cache_enabled: bool = True
    cache_ttl_seconds: int = 3600  # 1 hour
    cache_max_entries: int = 1000

    # Rate limiting
    rate_limit_enabled: bool = True
    rate_limit_requests_per_minute: int = 60
    rate_limit_tokens_per_minute: int = 100000

    # Retry
    retry_enabled: bool = True
    retry_max_attempts: int = 3
    retry_base_delay_ms: int = 1000
    retry_max_delay_ms: int = 30000

    # Cost tracking
    cost_tracking_enabled: bool = True
    budget_limit_usd: float | None = None  # None = unlimited
    budget_period: Literal["hourly", "daily", "monthly"] = "daily"

    # Logging
    log_requests: bool = True
    log_file: str | None = None
    log_full_messages: bool = False  # Privacy: don't log content by default

    # Fallback
    fallback_enabled: bool = False
    fallback_provider: str | None = None  # "openai" or "anthropic"

    # Timeouts
    request_timeout_seconds: int = 300
    connect_timeout_seconds: int = 10


# =============================================================================
# Caching
# =============================================================================

class SemanticCache:
    """Simple semantic cache based on message content hash.

    Uses OrderedDict for O(1) LRU eviction instead of list with O(n) pop(0).
    """

    def __init__(self, max_entries: int = 1000, ttl_seconds: int = 3600):
        self.max_entries = max_entries
        self.ttl_seconds = ttl_seconds
        # OrderedDict maintains insertion order and supports O(1) move_to_end/popitem
        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
        self._lock = asyncio.Lock()

    def _compute_key(self, messages: list[dict], model: str) -> str:
        """Compute cache key from messages and model."""
        # Normalize messages for consistent hashing
        normalized = json.dumps(
            {
                "model": model,
                "messages": messages,
            },
            sort_keys=True,
        )
        return hashlib.sha256(normalized.encode()).hexdigest()[:32]

    async def get(self, messages: list[dict], model: str) -> CacheEntry | None:
        """Get cached response if exists and not expired."""
        key = self._compute_key(messages, model)
        async with self._lock:
            entry = self._cache.get(key)

            if entry is None:
                return None

            # Check expiration
            age = (datetime.now() - entry.created_at).total_seconds()
            if age > entry.ttl_seconds:
                del self._cache[key]
                return None

            entry.hit_count += 1
            # Move to end for LRU (O(1) operation)
            self._cache.move_to_end(key)
            return entry

    async def set(
        self,
        messages: list[dict],
        model: str,
        response_body: bytes,
        response_headers: dict[str, str],
        tokens_saved: int = 0,
    ):
        """Cache a response."""
        key = self._compute_key(messages, model)

        async with self._lock:
            # If key already exists, remove it first to update position
            if key in self._cache:
                del self._cache[key]

            # Evict oldest entries if at capacity (LRU) - O(1) with popitem
            while len(self._cache) >= self.max_entries:
                self._cache.popitem(last=False)  # Remove oldest (first) entry

            self._cache[key] = CacheEntry(
                response_body=response_body,
                response_headers=response_headers,
                created_at=datetime.now(),
                ttl_seconds=self.ttl_seconds,
                tokens_saved_per_hit=tokens_saved,
            )

    async def stats(self) -> dict:
        """Get cache statistics."""
        async with self._lock:
            total_hits = sum(e.hit_count for e in self._cache.values())
            return {
                "entries": len(self._cache),
                "max_entries": self.max_entries,
                "total_hits": total_hits,
                "ttl_seconds": self.ttl_seconds,
            }

    async def clear(self):
        """Clear all cache entries."""
        async with self._lock:
            self._cache.clear()


# =============================================================================
# Rate Limiting
# =============================================================================


class TokenBucketRateLimiter:
    """Token bucket rate limiter for requests and tokens."""

    def __init__(
        self,
        requests_per_minute: int = 60,
        tokens_per_minute: int = 100000,
    ):
        self.requests_per_minute = requests_per_minute
        self.tokens_per_minute = tokens_per_minute

        # Per-key buckets (key = API key or IP)
        self._request_buckets: dict[str, RateLimitState] = defaultdict(
            lambda: RateLimitState(tokens=requests_per_minute, last_update=time.time())
        )
        self._token_buckets: dict[str, RateLimitState] = defaultdict(
            lambda: RateLimitState(tokens=tokens_per_minute, last_update=time.time())
        )
        self._lock = asyncio.Lock()

    def _refill(self, state: RateLimitState, rate_per_minute: float) -> float:
        """Refill bucket based on elapsed time."""
        now = time.time()
        elapsed = now - state.last_update
        refill = elapsed * (rate_per_minute / 60.0)
        state.tokens = min(rate_per_minute, state.tokens + refill)
        state.last_update = now
        return state.tokens

    async def check_request(self, key: str = "default") -> tuple[bool, float]:
        """Check if request is allowed. Returns (allowed, wait_seconds)."""
        async with self._lock:
            state = self._request_buckets[key]
            available = self._refill(state, self.requests_per_minute)

            if available >= 1:
                state.tokens -= 1
                return True, 0

            wait_seconds = (1 - available) * (60.0 / self.requests_per_minute)
            return False, wait_seconds

    async def check_tokens(self, key: str, token_count: int) -> tuple[bool, float]:
        """Check if token usage is allowed."""
        async with self._lock:
            state = self._token_buckets[key]
            available = self._refill(state, self.tokens_per_minute)

            if available >= token_count:
                state.tokens -= token_count
                return True, 0

            wait_seconds = (token_count - available) * (60.0 / self.tokens_per_minute)
            return False, wait_seconds

    async def stats(self) -> dict:
        """Get rate limiter statistics."""
        async with self._lock:
            return {
                "requests_per_minute": self.requests_per_minute,
                "tokens_per_minute": self.tokens_per_minute,
                "active_keys": len(self._request_buckets),
            }


# =============================================================================
# Cost Tracking
# =============================================================================

class CostTracker:
    """Track costs and enforce budgets.

    Cost history is automatically pruned to prevent unbounded memory growth:
    - Entries older than 24 hours are removed
    - Maximum of 100,000 entries are kept
    """

    # Fallback pricing - LiteLLM is preferred source
    # Pricing per 1M tokens (input, output, cached_input)
    PRICING = {
        # Anthropic
        "claude-3-5-sonnet": (3.00, 15.00, 0.30),
        "claude-3-5-haiku": (0.80, 4.00, 0.08),
        "claude-3-opus": (15.00, 75.00, 1.50),
        "claude-sonnet-4": (3.00, 15.00, 0.30),
        "claude-opus-4": (15.00, 75.00, 1.50),
        # OpenAI
        "gpt-4o": (2.50, 10.00, 1.25),
        "gpt-4o-mini": (0.15, 0.60, 0.075),
        "o1": (15.00, 60.00, 7.50),
        "o1-mini": (1.10, 4.40, 0.55),
        "o3-mini": (1.10, 4.40, 0.55),
        "gpt-4-turbo": (10.00, 30.00, 5.00),
    }

    MAX_COST_ENTRIES = 100_000
    COST_RETENTION_HOURS = 24

    def __init__(self, budget_limit_usd: float | None = None, budget_period: str = "daily"):
        self.budget_limit_usd = budget_limit_usd
        self.budget_period = budget_period

        # Cost tracking - using deque for efficient left-side removal
        self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
        self._total_cost_usd: float = 0
        self._total_savings_usd: float = 0
        self._last_prune_time: datetime = datetime.now()

    def _get_pricing(self, model: str) -> tuple[float, float, float] | None:
        """Get pricing for model."""
        model_lower = model.lower()
        for prefix, pricing in self.PRICING.items():
            if prefix in model_lower:
                return pricing
        return None

    def estimate_cost(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        cached_tokens: int = 0,
    ) -> float | None:
        """Estimate cost in USD."""
        # Try LiteLLM first
        if LITELLM_AVAILABLE:
            try:
                cost = litellm.completion_cost(
                    model=model,
                    prompt_tokens=input_tokens,
                    completion_tokens=output_tokens,
                )
                if cost is not None and cost > 0:
                    return float(cost)
            except Exception:
                pass

        # Fall back to hardcoded pricing
        pricing = self._get_pricing(model)
        if pricing is None:
            return None

        input_price, output_price, cached_price = pricing

        regular_input = input_tokens - cached_tokens
        cost = (
            (regular_input / 1_000_000) * input_price
            + (cached_tokens / 1_000_000) * cached_price
            + (output_tokens / 1_000_000) * output_price
        )
        return cost

    def _prune_old_costs(self):
        """Remove cost entries older than retention period.

        Called periodically (every 5 minutes) to prevent unbounded memory growth.
        The deque maxlen provides a hard cap, but time-based pruning keeps
        memory usage proportional to actual traffic patterns.
        """
        now = datetime.now()
        # Only prune every 5 minutes to avoid overhead
        if (now - self._last_prune_time).total_seconds() < 300:
            return

        self._last_prune_time = now
        cutoff = now - timedelta(hours=self.COST_RETENTION_HOURS)

        # Remove entries from the left (oldest) while they're older than cutoff
        while self._costs and self._costs[0][0] < cutoff:
            self._costs.popleft()

    def record_cost(self, cost_usd: float):
        """Record a cost. Periodically prunes old entries."""
        self._costs.append((datetime.now(), cost_usd))
        self._total_cost_usd += cost_usd
        # Periodically prune old costs to prevent memory growth
        self._prune_old_costs()

    def record_savings(self, savings_usd: float):
        """Record savings from optimization."""
        self._total_savings_usd += savings_usd

    def get_period_cost(self) -> float:
        """Get cost for current budget period."""
        now = datetime.now()

        if self.budget_period == "hourly":
            cutoff = now - timedelta(hours=1)
        elif self.budget_period == "daily":
            cutoff = now.replace(hour=0, minute=0, second=0, microsecond=0)
        else:  # monthly
            cutoff = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

        return sum(cost for ts, cost in self._costs if ts >= cutoff)

    def check_budget(self) -> tuple[bool, float]:
        """Check if within budget. Returns (allowed, remaining)."""
        if self.budget_limit_usd is None:
            return True, float("inf")

        period_cost = self.get_period_cost()
        remaining = self.budget_limit_usd - period_cost
        return remaining > 0, max(0, remaining)

    def stats(self) -> dict:
        """Get cost statistics."""
        return {
            "total_cost_usd": round(self._total_cost_usd, 4),
            "total_savings_usd": round(self._total_savings_usd, 4),
            "period_cost_usd": round(self.get_period_cost(), 4),
            "budget_limit_usd": self.budget_limit_usd,
            "budget_period": self.budget_period,
            "budget_remaining_usd": round(self.check_budget()[1], 4)
            if self.budget_limit_usd
            else None,
        }


# =============================================================================
# Prometheus Metrics
# =============================================================================

class PrometheusMetrics:
    """Prometheus-compatible metrics."""

    def __init__(self):
        self.requests_total = 0
        self.requests_by_provider: dict[str, int] = defaultdict(int)
        self.requests_by_model: dict[str, int] = defaultdict(int)
        self.requests_cached = 0
        self.requests_rate_limited = 0
        self.requests_failed = 0

        self.tokens_input_total = 0
        self.tokens_output_total = 0
        self.tokens_saved_total = 0

        self.latency_sum_ms = 0.0
        self.latency_count = 0

        self.cost_total_usd = 0.0
        self.savings_total_usd = 0.0

        self._lock = asyncio.Lock()

    async def record_request(
        self,
        provider: str,
        model: str,
        input_tokens: int,
        output_tokens: int,
        tokens_saved: int,
        latency_ms: float,
        cached: bool = False,
        cost_usd: float = 0,
        savings_usd: float = 0,
    ):
        """Record metrics for a request."""
        async with self._lock:
            self.requests_total += 1
            self.requests_by_provider[provider] += 1
            self.requests_by_model[model] += 1

            if cached:
                self.requests_cached += 1

            self.tokens_input_total += input_tokens
            self.tokens_output_total += output_tokens
            self.tokens_saved_total += tokens_saved

            self.latency_sum_ms += latency_ms
            self.latency_count += 1

            self.cost_total_usd += cost_usd
            self.savings_total_usd += savings_usd

    async def record_rate_limited(self):
        async with self._lock:
            self.requests_rate_limited += 1

    async def record_failed(self):
        async with self._lock:
            self.requests_failed += 1

    async def export(self) -> str:
        """Export metrics in Prometheus format."""
        async with self._lock:
            lines = [
                "# HELP headroom_requests_total Total number of requests",
                "# TYPE headroom_requests_total counter",
                f"headroom_requests_total {self.requests_total}",
                "",
                "# HELP headroom_requests_cached_total Cached request count",
                "# TYPE headroom_requests_cached_total counter",
                f"headroom_requests_cached_total {self.requests_cached}",
                "",
                "# HELP headroom_requests_rate_limited_total Rate limited requests",
                "# TYPE headroom_requests_rate_limited_total counter",
                f"headroom_requests_rate_limited_total {self.requests_rate_limited}",
                "",
                "# HELP headroom_requests_failed_total Failed requests",
                "# TYPE headroom_requests_failed_total counter",
                f"headroom_requests_failed_total {self.requests_failed}",
                "",
                "# HELP headroom_tokens_input_total Total input tokens",
                "# TYPE headroom_tokens_input_total counter",
                f"headroom_tokens_input_total {self.tokens_input_total}",
                "",
                "# HELP headroom_tokens_output_total Total output tokens",
                "# TYPE headroom_tokens_output_total counter",
                f"headroom_tokens_output_total {self.tokens_output_total}",
                "",
                "# HELP headroom_tokens_saved_total Tokens saved by optimization",
                "# TYPE headroom_tokens_saved_total counter",
                f"headroom_tokens_saved_total {self.tokens_saved_total}",
                "",
                "# HELP headroom_latency_ms_sum Sum of request latencies",
                "# TYPE headroom_latency_ms_sum counter",
                f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
                "",
                "# HELP headroom_cost_usd_total Total cost in USD",
                "# TYPE headroom_cost_usd_total counter",
                f"headroom_cost_usd_total {self.cost_total_usd:.6f}",
                "",
                "# HELP headroom_savings_usd_total Total savings in USD",
                "# TYPE headroom_savings_usd_total counter",
                f"headroom_savings_usd_total {self.savings_total_usd:.6f}",
            ]

            # Per-provider metrics
            lines.extend(
                [
                    "",
                    "# HELP headroom_requests_by_provider Requests by provider",
                    "# TYPE headroom_requests_by_provider counter",
                ]
            )
            for provider, count in self.requests_by_provider.items():
                lines.append(f'headroom_requests_by_provider{{provider="{provider}"}} {count}')

            # Per-model metrics
            lines.extend(
                [
                    "",
                    "# HELP headroom_requests_by_model Requests by model",
                    "# TYPE headroom_requests_by_model counter",
                ]
            )
            for model, count in self.requests_by_model.items():
                lines.append(f'headroom_requests_by_model{{model="{model}"}} {count}')

            return "\n".join(lines)


# =============================================================================
# Request Logger
# =============================================================================


class RequestLogger:
    """Log requests to JSONL file.

    Uses a deque with max 10,000 entries to prevent unbounded memory growth.
    """

    MAX_LOG_ENTRIES = 10_000

    def __init__(self, log_file: str | None = None, log_full_messages: bool = False):
        self.log_file = Path(log_file) if log_file else None
        self.log_full_messages = log_full_messages
        # Use deque with maxlen for automatic FIFO eviction
        self._logs: deque[RequestLog] = deque(maxlen=self.MAX_LOG_ENTRIES)

        if self.log_file:
            self.log_file.parent.mkdir(parents=True, exist_ok=True)

    def log(self, entry: RequestLog):
        """Log a request. Oldest entries are automatically removed when limit reached."""
        self._logs.append(entry)

        if self.log_file:
            with open(self.log_file, "a") as f:
                log_dict = asdict(entry)
                if not self.log_full_messages:
                    log_dict.pop("request_messages", None)
                    log_dict.pop("response_content", None)
                f.write(json.dumps(log_dict) + "\n")

    def get_recent(self, n: int = 100) -> list[dict]:
        """Get recent log entries."""
        # Convert deque to list for slicing (deque doesn't support slicing)
        entries = list(self._logs)[-n:]
        return [
            {
                k: v
                for k, v in asdict(e).items()
                if k not in ("request_messages", "response_content")
            }
            for e in entries
        ]

    def stats(self) -> dict:
        """Get logging statistics."""
        return {
            "total_logged": len(self._logs),
            "log_file": str(self.log_file) if self.log_file else None,
        }


# =============================================================================
# Main Proxy
# =============================================================================

class HeadroomProxy:
    """Production-ready Headroom optimization proxy."""

    ANTHROPIC_API_URL = "https://api.anthropic.com"
    OPENAI_API_URL = "https://api.openai.com"

    def __init__(self, config: ProxyConfig):
        self.config = config

        # Initialize providers
        self.anthropic_provider = AnthropicProvider()
        self.openai_provider = OpenAIProvider()

        # Initialize transforms based on routing mode
        if config.smart_routing:
            # Smart routing: ContentRouter handles all content types intelligently
            # It lazy-loads compressors (including LLMLingua) only when needed
            router_config = ContentRouterConfig(
                enable_llmlingua=config.llmlingua_enabled,
                enable_code_aware=config.code_aware_enabled,
            )
            transforms = [
                CacheAligner(CacheAlignerConfig(enabled=True)),
                ContentRouter(router_config),
                RollingWindow(
                    RollingWindowConfig(
                        enabled=True,
                        keep_system=True,
                        keep_last_turns=config.keep_last_turns,
                    )
                ),
            ]
            self._llmlingua_status = "lazy" if config.llmlingua_enabled else "disabled"
            self._code_aware_status = "lazy" if config.code_aware_enabled else "disabled"
        else:
            # Legacy mode: sequential pipeline
            transforms = [
                CacheAligner(CacheAlignerConfig(enabled=True)),
                SmartCrusher(
                    SmartCrusherConfig(  # type: ignore[arg-type]
                        enabled=True,
                        min_tokens_to_crush=config.min_tokens_to_crush,
                        max_items_after_crush=config.max_items_after_crush,
                    ),
                    ccr_config=CCRConfig(
                        enabled=config.ccr_inject_tool,
                        inject_retrieval_marker=config.ccr_inject_tool,  # Add CCR markers
                    ),
                ),
                RollingWindow(
                    RollingWindowConfig(
                        enabled=True,
                        keep_system=True,
                        keep_last_turns=config.keep_last_turns,
                    )
                ),
            ]
            # Add LLMLingua if enabled and available
            self._llmlingua_status = self._setup_llmlingua(config, transforms)
            # Add CodeAware if enabled and available
            self._code_aware_status = self._setup_code_aware(config, transforms)

        self.anthropic_pipeline = TransformPipeline(
            transforms=transforms,
            provider=self.anthropic_provider,
        )
        self.openai_pipeline = TransformPipeline(
            transforms=transforms,
            provider=self.openai_provider,
        )

        # Initialize components
        self.cache = (
            SemanticCache(
                max_entries=config.cache_max_entries,
                ttl_seconds=config.cache_ttl_seconds,
            )
            if config.cache_enabled
            else None
        )

        self.rate_limiter = (
            TokenBucketRateLimiter(
                requests_per_minute=config.rate_limit_requests_per_minute,
                tokens_per_minute=config.rate_limit_tokens_per_minute,
            )
            if config.rate_limit_enabled
            else None
        )

        self.cost_tracker = (
            CostTracker(
                budget_limit_usd=config.budget_limit_usd,
                budget_period=config.budget_period,
            )
            if config.cost_tracking_enabled
            else None
        )

        self.metrics = PrometheusMetrics()

        self.logger = (
            RequestLogger(
                log_file=config.log_file,
                log_full_messages=config.log_full_messages,
            )
            if config.log_requests
            else None
        )

        # HTTP client
        self.http_client: httpx.AsyncClient | None = None

        # Request counter for IDs
        self._request_counter = 0
        self._request_counter_lock = asyncio.Lock()

        # CCR tool injectors (one per provider)
        self.anthropic_tool_injector = CCRToolInjector(
            provider="anthropic",
            inject_tool=config.ccr_inject_tool,
            inject_system_instructions=config.ccr_inject_system_instructions,
        )
        self.openai_tool_injector = CCRToolInjector(
            provider="openai",
            inject_tool=config.ccr_inject_tool,
            inject_system_instructions=config.ccr_inject_system_instructions,
        )

        # CCR Response Handler (handles CCR tool calls automatically)
        self.ccr_response_handler = (
            CCRResponseHandler(
                ResponseHandlerConfig(
                    enabled=True,
                    max_retrieval_rounds=config.ccr_max_retrieval_rounds,
                )
            )
            if config.ccr_handle_responses
            else None
        )

        # CCR Context Tracker (tracks compressed content across turns)
        self.ccr_context_tracker = (
            ContextTracker(
                ContextTrackerConfig(
                    enabled=True,
                    proactive_expansion=config.ccr_proactive_expansion,
                    max_proactive_expansions=config.ccr_max_proactive_expansions,
                )
            )
            if config.ccr_context_tracking
            else None
        )

        # Turn counter for context tracking
        self._turn_counter = 0

    def _setup_llmlingua(self, config: ProxyConfig, transforms: list) -> str:
        """Set up LLMLingua compression if enabled.

        Args:
            config: Proxy configuration
            transforms: Transform list to append to

        Returns:
            Status string for logging: 'enabled', 'disabled', 'available', 'unavailable'
        """
        if config.llmlingua_enabled:
            if _LLMLINGUA_AVAILABLE:
                llmlingua_config = LLMLinguaConfig(
                    device=config.llmlingua_device,
                    target_compression_rate=config.llmlingua_target_rate,
                    enable_ccr=config.ccr_inject_tool,  # Link to CCR
                )
                # Insert before RollingWindow (which should be last)
                # LLMLingua works best on individual tool outputs before windowing
                transforms.insert(-1, LLMLinguaCompressor(llmlingua_config))
                return "enabled"
            else:
                logger.warning(
                    "LLMLingua requested but not installed. "
                    "Install with: pip install headroom-ai[llmlingua]"
                )
                return "unavailable"
        else:
            if _LLMLINGUA_AVAILABLE:
                return "available"  # Available but not enabled - hint to user
            return "disabled"

    def _setup_code_aware(self, config: ProxyConfig, transforms: list) -> str:
        """Set up code-aware compression if enabled.

        Args:
            config: Proxy configuration
            transforms: Transform list to append to

        Returns:
            Status string for logging: 'enabled', 'disabled', 'available', 'unavailable'
        """
        if config.code_aware_enabled:
            if is_tree_sitter_available():
                code_config = CodeCompressorConfig(
                    preserve_imports=True,
                    preserve_signatures=True,
                    preserve_type_annotations=True,
                    preserve_error_handlers=True,
                )
                # Insert before RollingWindow (which should be last)
                transforms.insert(-1, CodeAwareCompressor(code_config))
                return "enabled"
            else:
                logger.warning(
                    "Code-aware compression requested but tree-sitter not installed. "
                    "Install with: pip install headroom-ai[code]"
                )
                return "unavailable"
        else:
            if is_tree_sitter_available():
                return "available"  # Available but not enabled
            return "disabled"

    async def startup(self):
        """Initialize async resources."""
        self.http_client = httpx.AsyncClient(
            timeout=httpx.Timeout(
                connect=self.config.connect_timeout_seconds,
                read=self.config.request_timeout_seconds,
                write=self.config.request_timeout_seconds,
                pool=self.config.connect_timeout_seconds,
            )
        )
        logger.info("Headroom Proxy started")
        logger.info(f"Optimization: {'ENABLED' if self.config.optimize else 'DISABLED'}")
        logger.info(f"Caching: {'ENABLED' if self.config.cache_enabled else 'DISABLED'}")
        logger.info(f"Rate Limiting: {'ENABLED' if self.config.rate_limit_enabled else 'DISABLED'}")

        # Smart routing status
        if self.config.smart_routing:
            logger.info("Smart Routing: ENABLED (intelligent content detection)")
        else:
            logger.info("Smart Routing: DISABLED (legacy sequential mode)")

        # LLMLingua status with helpful hint
        if self._llmlingua_status == "enabled":
            logger.info(
                f"LLMLingua: ENABLED (device={self.config.llmlingua_device}, "
                f"rate={self.config.llmlingua_target_rate})"
            )
        elif self._llmlingua_status == "lazy":
            logger.info("LLMLingua: LAZY (will load when prose content detected)")
        elif self._llmlingua_status == "available":
            logger.info("LLMLingua: available but disabled (use --llmlingua)")
        elif self._llmlingua_status == "unavailable":
            logger.info("LLMLingua: not installed (pip install headroom-ai[llmlingua])")
        elif self._llmlingua_status == "disabled":
            logger.info("LLMLingua: DISABLED")

        # Code-aware status
        if self._code_aware_status == "enabled":
            logger.info("Code-Aware: ENABLED (AST-based compression)")
        elif self._code_aware_status == "lazy":
            logger.info("Code-Aware: LAZY (will load when code content detected)")
        elif self._code_aware_status == "available":
            logger.info("Code-Aware: available but disabled (use --code-aware)")
        elif self._code_aware_status == "unavailable":
            logger.info("Code-Aware: not installed (pip install headroom-ai[code])")
        elif self._code_aware_status == "disabled":
            logger.info("Code-Aware: DISABLED")

        # CCR status
        ccr_features = []
        if self.config.ccr_inject_tool:
            ccr_features.append("tool_injection")
        if self.config.ccr_handle_responses:
            ccr_features.append("response_handling")
        if self.config.ccr_context_tracking:
            ccr_features.append("context_tracking")
        if self.config.ccr_proactive_expansion:
            ccr_features.append("proactive_expansion")
        if ccr_features:
            logger.info(f"CCR (Compress-Cache-Retrieve): ENABLED ({', '.join(ccr_features)})")
        else:
            logger.info("CCR: DISABLED")

    async def shutdown(self):
        """Cleanup async resources."""
        if self.http_client:
            await self.http_client.aclose()

        # Print final stats
        self._print_summary()

    def _print_summary(self):
        """Print session summary."""
        m = self.metrics
        logger.info("=" * 70)
        logger.info("HEADROOM PROXY SESSION SUMMARY")
        logger.info("=" * 70)
        logger.info(f"Total requests: {m.requests_total}")
        logger.info(f"Cached responses: {m.requests_cached}")
        logger.info(f"Rate limited: {m.requests_rate_limited}")
        logger.info(f"Failed: {m.requests_failed}")
        logger.info(f"Input tokens: {m.tokens_input_total:,}")
        logger.info(f"Output tokens: {m.tokens_output_total:,}")
        logger.info(f"Tokens saved: {m.tokens_saved_total:,}")
        if m.tokens_input_total > 0:
            savings_pct = (
                m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
            ) * 100
            logger.info(f"Token savings: {savings_pct:.1f}%")
        logger.info(f"Total cost: ${m.cost_total_usd:.4f}")
        logger.info(f"Total savings: ${m.savings_total_usd:.4f}")
        if m.latency_count > 0:
            avg_latency = m.latency_sum_ms / m.latency_count
            logger.info(f"Avg latency: {avg_latency:.0f}ms")
        logger.info("=" * 70)

    async def _next_request_id(self) -> str:
        """Generate unique request ID."""
        async with self._request_counter_lock:
            self._request_counter += 1
            return f"hr_{int(time.time())}_{self._request_counter:06d}"

    def _extract_tags(self, headers: dict) -> dict[str, str]:
        """Extract Headroom tags from headers."""
        tags = {}
        for key, value in headers.items():
            if key.lower().startswith("x-headroom-"):
                tag_name = key.lower().replace("x-headroom-", "")
                tags[tag_name] = value
        return tags

    async def _retry_request(
        self,
        method: str,
        url: str,
        headers: dict,
        body: dict,
        stream: bool = False,
    ) -> httpx.Response:
        """Make request with retry and exponential backoff."""
        last_error = None

        for attempt in range(self.config.retry_max_attempts):
            try:
                if stream:
                    # For streaming, we return early - retry happens at higher level
                    return await self.http_client.post(url, json=body, headers=headers)  # type: ignore[union-attr]
                else:
                    response = await self.http_client.post(url, json=body, headers=headers)  # type: ignore[union-attr]

                    # Don't retry client errors (4xx)
                    if 400 <= response.status_code < 500:
                        return response

                    # Retry server errors (5xx)
                    if response.status_code >= 500:
                        raise httpx.HTTPStatusError(
                            f"Server error: {response.status_code}",
                            request=response.request,
                            response=response,
                        )

                    return response

            except (httpx.ConnectError, httpx.ReadTimeout, httpx.HTTPStatusError) as e:
                last_error = e

                if not self.config.retry_enabled or attempt >= self.config.retry_max_attempts - 1:
                    raise

                # Exponential backoff with jitter
                delay = min(
                    self.config.retry_base_delay_ms * (2**attempt),
                    self.config.retry_max_delay_ms,
                )
                delay_with_jitter = delay * (0.5 + random.random())

                logger.warning(
                    f"Request failed (attempt {attempt + 1}), retrying in {delay_with_jitter:.0f}ms: {e}"
                )
                await asyncio.sleep(delay_with_jitter / 1000)

        raise last_error  # type: ignore[misc]

    async def handle_anthropic_messages(
        self,
        request: Request,
    ) -> Response | StreamingResponse:
        """Handle Anthropic /v1/messages endpoint."""
        start_time = time.time()
        request_id = await self._next_request_id()

        # Check request body size
        content_length = request.headers.get("content-length")
        if content_length and int(content_length) > MAX_REQUEST_BODY_SIZE:
            return JSONResponse(
                status_code=413,
                content={
                    "type": "error",
                    "error": {
                        "type": "request_too_large",
                        "message": f"Request body too large. Maximum size is {MAX_REQUEST_BODY_SIZE // (1024 * 1024)}MB",
                    },
                },
            )

        # Parse request
        try:
            body = await request.json()
        except json.JSONDecodeError as e:
            return JSONResponse(
                status_code=400,
                content={
                    "type": "error",
                    "error": {
                        "type": "invalid_request_error",
                        "message": f"Invalid JSON in request body: {e!s}",
                    },
                },
            )
        model = body.get("model", "unknown")
        messages = body.get("messages", [])
        stream = body.get("stream", False)

        # Extract headers and tags
        headers = dict(request.headers.items())
        headers.pop("host", None)
        headers.pop("content-length", None)
        tags = self._extract_tags(headers)

        # Rate limiting
        if self.rate_limiter:
            rate_key = headers.get("x-api-key", "default")[:16]
            allowed, wait_seconds = await self.rate_limiter.check_request(rate_key)
            if not allowed:
                await self.metrics.record_rate_limited()
                raise HTTPException(
                    status_code=429,
                    detail=f"Rate limited. Retry after {wait_seconds:.1f}s",
                    headers={"Retry-After": str(int(wait_seconds) + 1)},
                )

        # Budget check
        if self.cost_tracker:
            allowed, remaining = self.cost_tracker.check_budget()
            if not allowed:
                raise HTTPException(
                    status_code=429,
                    detail=f"Budget exceeded for {self.config.budget_period} period",
                )

        # Check cache (non-streaming only)
        cache_hit = False
        if self.cache and not stream:
            cached = await self.cache.get(messages, model)
            if cached:
                cache_hit = True
                optimization_latency = (time.time() - start_time) * 1000

                await self.metrics.record_request(
                    provider="anthropic",
                    model=model,
                    input_tokens=0,
                    output_tokens=0,
                    tokens_saved=cached.tokens_saved_per_hit,
                    latency_ms=optimization_latency,
                    cached=True,
                )

                # Remove compression headers from cached response
                response_headers = dict(cached.response_headers)
                response_headers.pop("content-encoding", None)
                response_headers.pop("content-length", None)

                return Response(
                    content=cached.response_body,
                    headers=response_headers,
                    media_type="application/json",
                )

        # Count original tokens
        tokenizer = get_tokenizer(model)
        original_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)

        # Apply optimization
        transforms_applied = []
        optimized_messages = messages
        optimized_tokens = original_tokens

        if self.config.optimize and messages:
            try:
                context_limit = self.anthropic_provider.get_context_limit(model)
                result = self.anthropic_pipeline.apply(
                    messages=messages,
                    model=model,
                    model_limit=context_limit,
                )

                if result.messages != messages:
                    optimized_messages = result.messages
                    transforms_applied = result.transforms_applied
                    optimized_tokens = sum(
                        tokenizer.count_text(str(m.get("content", ""))) for m in optimized_messages
                    )
            except Exception as e:
                logger.warning(f"Optimization failed: {e}")

        tokens_saved = original_tokens - optimized_tokens
        optimization_latency = (time.time() - start_time) * 1000

        # CCR Tool Injection: Inject retrieval tool if compression occurred
        tools = body.get("tools")
        if self.config.ccr_inject_tool or self.config.ccr_inject_system_instructions:
            # Create fresh injector to avoid state leakage between requests
            injector = CCRToolInjector(
                provider="anthropic",
                inject_tool=self.config.ccr_inject_tool,
                inject_system_instructions=self.config.ccr_inject_system_instructions,
            )
            optimized_messages, tools, was_injected = injector.process_request(
                optimized_messages, tools
            )

            if injector.has_compressed_content:
                if was_injected:
                    logger.debug(
                        f"[{request_id}] CCR: Injected retrieval tool for hashes: {injector.detected_hashes}"
                    )
                else:
                    logger.debug(
                        f"[{request_id}] CCR: Tool already present (MCP?), skipped injection for hashes: {injector.detected_hashes}"
                    )

                # Track compression in context tracker for multi-turn awareness
                if self.ccr_context_tracker:
                    self._turn_counter += 1
                    for hash_key in injector.detected_hashes:
                        # Get compression metadata from store
                        store = get_compression_store()
                        entry = store.get_metadata(hash_key)
                        if entry:
                            self.ccr_context_tracker.track_compression(
                                hash_key=hash_key,
                                turn_number=self._turn_counter,
                                tool_name=entry.get("tool_name"),
                                original_count=entry.get("original_item_count", 0),
                                compressed_count=entry.get("compressed_item_count", 0),
                                query_context=entry.get("query_context", ""),
                                sample_content=entry.get("compressed_content", "")[:500],
                            )

        # CCR Proactive Expansion: Check if current query needs expanded context
        if self.ccr_context_tracker and self.config.ccr_proactive_expansion:
            # Extract user query from messages
            user_query = ""
            for msg in reversed(messages):
                if msg.get("role") == "user":
                    content = msg.get("content", "")
                    if isinstance(content, str):
                        user_query = content
                    elif isinstance(content, list):
                        for block in content:
                            if isinstance(block, dict) and block.get("type") == "text":
                                user_query = block.get("text", "")
                                break
                    break

            if user_query:
                recommendations = self.ccr_context_tracker.analyze_query(
                    user_query, self._turn_counter
                )
                if recommendations:
                    expansions = self.ccr_context_tracker.execute_expansions(recommendations)
                    if expansions:
                        # Add expanded context to the system message or as additional context
                        expansion_text = self.ccr_context_tracker.format_expansions_for_context(
                            expansions
                        )
                        logger.info(
                            f"[{request_id}] CCR: Proactively expanded {len(expansions)} context(s) "
                            f"based on query relevance"
                        )
                        # Append to the last user message
                        if optimized_messages and optimized_messages[-1].get("role") == "user":
                            last_msg = optimized_messages[-1]
                            content = last_msg.get("content", "")
                            if isinstance(content, str):
                                optimized_messages[-1] = {
                                    **last_msg,
                                    "content": content + "\n\n" + expansion_text,
                                }

        # Update body
        body["messages"] = optimized_messages
        if tools is not None:
            body["tools"] = tools

        # Forward request
        url = f"{self.ANTHROPIC_API_URL}/v1/messages"

        try:
            if stream:
                return await self._stream_response(
                    url,
                    headers,
                    body,
                    "anthropic",
                    model,
                    request_id,
                    original_tokens,
                    optimized_tokens,
                    tokens_saved,
                    transforms_applied,
                    tags,
                    optimization_latency,
                )
            else:
                response = await self._retry_request("POST", url, headers, body)

                # Parse response for CCR handling
                resp_json = None
                try:
                    resp_json = response.json()
                except Exception:
                    pass

                # CCR Response Handling: Handle headroom_retrieve tool calls automatically
|
|
1382
|
+
if (
|
|
1383
|
+
self.ccr_response_handler
|
|
1384
|
+
and resp_json
|
|
1385
|
+
and response.status_code == 200
|
|
1386
|
+
and self.ccr_response_handler.has_ccr_tool_calls(resp_json, "anthropic")
|
|
1387
|
+
):
|
|
1388
|
+
logger.info(f"[{request_id}] CCR: Detected retrieval tool call, handling...")
|
|
1389
|
+
|
|
1390
|
+
# Create API call function for continuation
|
|
1391
|
+
# Use a fresh client to avoid potential decompression state issues
|
|
1392
|
+
async def api_call_fn(
|
|
1393
|
+
msgs: list[dict], tls: list[dict] | None
|
|
1394
|
+
) -> dict[str, Any]:
|
|
1395
|
+
continuation_body = {
|
|
1396
|
+
**body,
|
|
1397
|
+
"messages": msgs,
|
|
1398
|
+
}
|
|
1399
|
+
if tls is not None:
|
|
1400
|
+
continuation_body["tools"] = tls
|
|
1401
|
+
|
|
1402
|
+
# Use clean headers for continuation
|
|
1403
|
+
continuation_headers = {
|
|
1404
|
+
k: v
|
|
1405
|
+
for k, v in headers.items()
|
|
1406
|
+
if k.lower()
|
|
1407
|
+
not in (
|
|
1408
|
+
"content-encoding",
|
|
1409
|
+
"transfer-encoding",
|
|
1410
|
+
"accept-encoding",
|
|
1411
|
+
"content-length",
|
|
1412
|
+
)
|
|
1413
|
+
}
|
|
1414
|
+
|
|
1415
|
+
# Use a fresh client for CCR continuations
|
|
1416
|
+
logger.info(f"CCR: Making continuation request with {len(msgs)} messages")
|
|
1417
|
+
async with httpx.AsyncClient(
|
|
1418
|
+
timeout=httpx.Timeout(120.0),
|
|
1419
|
+
) as ccr_client:
|
|
1420
|
+
try:
|
|
1421
|
+
cont_response = await ccr_client.post(
|
|
1422
|
+
url,
|
|
1423
|
+
json=continuation_body,
|
|
1424
|
+
headers=continuation_headers,
|
|
1425
|
+
)
|
|
1426
|
+
logger.info(
|
|
1427
|
+
f"CCR: Got response status={cont_response.status_code}, "
|
|
1428
|
+
f"content-encoding={cont_response.headers.get('content-encoding')}"
|
|
1429
|
+
)
|
|
1430
|
+
result: dict[str, Any] = cont_response.json()
|
|
1431
|
+
logger.info("CCR: Parsed JSON successfully")
|
|
1432
|
+
return result
|
|
1433
|
+
except Exception as e:
|
|
1434
|
+
logger.error(
|
|
1435
|
+
f"CCR: API call failed: {e}, "
|
|
1436
|
+
f"response headers: {dict(cont_response.headers) if 'cont_response' in dir() else 'N/A'}"
|
|
1437
|
+
)
|
|
1438
|
+
raise
|
|
1439
|
+
|
|
1440
|
+
# Handle CCR tool calls
|
|
1441
|
+
try:
|
|
1442
|
+
final_resp_json = await self.ccr_response_handler.handle_response(
|
|
1443
|
+
resp_json,
|
|
1444
|
+
optimized_messages,
|
|
1445
|
+
tools,
|
|
1446
|
+
api_call_fn,
|
|
1447
|
+
provider="anthropic",
|
|
1448
|
+
)
|
|
1449
|
+
# Update response content with final response
|
|
1450
|
+
resp_json = final_resp_json
|
|
1451
|
+
# Remove encoding headers since content is now uncompressed JSON
|
|
1452
|
+
ccr_response_headers = {
|
|
1453
|
+
k: v
|
|
1454
|
+
for k, v in response.headers.items()
|
|
1455
|
+
if k.lower() not in ("content-encoding", "content-length")
|
|
1456
|
+
}
|
|
1457
|
+
response = httpx.Response(
|
|
1458
|
+
status_code=200,
|
|
1459
|
+
content=json.dumps(final_resp_json).encode(),
|
|
1460
|
+
headers=ccr_response_headers,
|
|
1461
|
+
)
|
|
1462
|
+
logger.info(f"[{request_id}] CCR: Retrieval handled successfully")
|
|
1463
|
+
except Exception as e:
|
|
1464
|
+
import traceback
|
|
1465
|
+
|
|
1466
|
+
logger.warning(
|
|
1467
|
+
f"[{request_id}] CCR: Response handling failed: {e}\n"
|
|
1468
|
+
f"Traceback: {traceback.format_exc()}"
|
|
1469
|
+
)
|
|
1470
|
+
# Continue with original response
|
|
1471
|
+
|
|
1472
|
+
total_latency = (time.time() - start_time) * 1000
|
|
1473
|
+
|
|
1474
|
+
# Parse response for output tokens
|
|
1475
|
+
output_tokens = 0
|
|
1476
|
+
if resp_json:
|
|
1477
|
+
usage = resp_json.get("usage", {})
|
|
1478
|
+
output_tokens = usage.get("output_tokens", 0)
|
|
1479
|
+
|
|
1480
|
+
# Calculate cost
|
|
1481
|
+
cost_usd = None
|
|
1482
|
+
savings_usd = None
|
|
1483
|
+
if self.cost_tracker:
|
|
1484
|
+
cost_usd = self.cost_tracker.estimate_cost(
|
|
1485
|
+
model, optimized_tokens, output_tokens
|
|
1486
|
+
)
|
|
1487
|
+
original_cost = self.cost_tracker.estimate_cost(
|
|
1488
|
+
model, original_tokens, output_tokens
|
|
1489
|
+
)
|
|
1490
|
+
if cost_usd and original_cost:
|
|
1491
|
+
savings_usd = original_cost - cost_usd
|
|
1492
|
+
self.cost_tracker.record_cost(cost_usd)
|
|
1493
|
+
self.cost_tracker.record_savings(savings_usd)
|
|
1494
|
+
|
|
1495
|
+
# Cache response
|
|
1496
|
+
if self.cache and response.status_code == 200:
|
|
1497
|
+
await self.cache.set(
|
|
1498
|
+
messages,
|
|
1499
|
+
model,
|
|
1500
|
+
response.content,
|
|
1501
|
+
dict(response.headers),
|
|
1502
|
+
tokens_saved=tokens_saved,
|
|
1503
|
+
)
|
|
1504
|
+
|
|
1505
|
+
# Record metrics
|
|
1506
|
+
await self.metrics.record_request(
|
|
1507
|
+
provider="anthropic",
|
|
1508
|
+
model=model,
|
|
1509
|
+
input_tokens=optimized_tokens,
|
|
1510
|
+
output_tokens=output_tokens,
|
|
1511
|
+
tokens_saved=tokens_saved,
|
|
1512
|
+
latency_ms=total_latency,
|
|
1513
|
+
cost_usd=cost_usd or 0,
|
|
1514
|
+
savings_usd=savings_usd or 0,
|
|
1515
|
+
)
|
|
1516
|
+
|
|
1517
|
+
# Log request
|
|
1518
|
+
if self.logger:
|
|
1519
|
+
self.logger.log(
|
|
1520
|
+
RequestLog(
|
|
1521
|
+
request_id=request_id,
|
|
1522
|
+
timestamp=datetime.now().isoformat(),
|
|
1523
|
+
provider="anthropic",
|
|
1524
|
+
model=model,
|
|
1525
|
+
input_tokens_original=original_tokens,
|
|
1526
|
+
input_tokens_optimized=optimized_tokens,
|
|
1527
|
+
output_tokens=output_tokens,
|
|
1528
|
+
tokens_saved=tokens_saved,
|
|
1529
|
+
savings_percent=(tokens_saved / original_tokens * 100)
|
|
1530
|
+
if original_tokens > 0
|
|
1531
|
+
else 0,
|
|
1532
|
+
estimated_cost_usd=cost_usd,
|
|
1533
|
+
estimated_savings_usd=savings_usd,
|
|
1534
|
+
optimization_latency_ms=optimization_latency,
|
|
1535
|
+
total_latency_ms=total_latency,
|
|
1536
|
+
tags=tags,
|
|
1537
|
+
cache_hit=cache_hit,
|
|
1538
|
+
transforms_applied=transforms_applied,
|
|
1539
|
+
request_messages=messages if self.config.log_full_messages else None,
|
|
1540
|
+
)
|
|
1541
|
+
)
|
|
1542
|
+
|
|
1543
|
+
# Log to console
|
|
1544
|
+
if tokens_saved > 0:
|
|
1545
|
+
logger.info(
|
|
1546
|
+
f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
|
|
1547
|
+
f"(saved {tokens_saved:,} tokens, ${savings_usd:.4f})"
|
|
1548
|
+
if savings_usd
|
|
1549
|
+
else f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
|
|
1550
|
+
f"(saved {tokens_saved:,} tokens)"
|
|
1551
|
+
)
|
|
1552
|
+
|
|
1553
|
+
# Remove compression headers since httpx already decompressed the response
|
|
1554
|
+
response_headers = dict(response.headers)
|
|
1555
|
+
response_headers.pop("content-encoding", None)
|
|
1556
|
+
response_headers.pop("content-length", None) # Length changed after decompression
|
|
1557
|
+
|
|
1558
|
+
return Response(
|
|
1559
|
+
content=response.content,
|
|
1560
|
+
status_code=response.status_code,
|
|
1561
|
+
headers=response_headers,
|
|
1562
|
+
)
|
|
1563
|
+
|
|
1564
|
+
except Exception as e:
|
|
1565
|
+
await self.metrics.record_failed()
|
|
1566
|
+
# Log full error details internally for debugging
|
|
1567
|
+
logger.error(f"[{request_id}] Request failed: {type(e).__name__}: {e}")
|
|
1568
|
+
|
|
1569
|
+
# Try fallback if enabled
|
|
1570
|
+
if self.config.fallback_enabled and self.config.fallback_provider == "openai":
|
|
1571
|
+
logger.info(f"[{request_id}] Attempting fallback to OpenAI")
|
|
1572
|
+
# Convert to OpenAI format and retry
|
|
1573
|
+
# (simplified - would need message format conversion)
|
|
1574
|
+
|
|
1575
|
+
# Return sanitized error message to client (don't expose internal details)
|
|
1576
|
+
return JSONResponse(
|
|
1577
|
+
status_code=502,
|
|
1578
|
+
content={
|
|
1579
|
+
"type": "error",
|
|
1580
|
+
"error": {
|
|
1581
|
+
"type": "api_error",
|
|
1582
|
+
"message": "An error occurred while processing your request. Please try again.",
|
|
1583
|
+
},
|
|
1584
|
+
},
|
|
1585
|
+
)
|
|
1586
|
+
|
|
1587
|
+
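    # --- Illustrative sketch (not part of server.py): the fallback branch above only logs
    # --- its intent; an actual Anthropic-to-OpenAI fallback would also need to convert
    # --- message shapes. A minimal, hypothetical conversion (helper name and mapping are
    # --- assumptions, text blocks only) might look like this:
    #
    #   def _anthropic_to_openai_messages(messages: list[dict]) -> list[dict]:
    #       """Flatten Anthropic content blocks into OpenAI-style string content."""
    #       converted = []
    #       for msg in messages:
    #           content = msg.get("content", "")
    #           if isinstance(content, list):
    #               # Keep only text blocks; tool_use/tool_result blocks need a richer mapping.
    #               content = "\n".join(
    #                   block.get("text", "")
    #                   for block in content
    #                   if isinstance(block, dict) and block.get("type") == "text"
    #               )
    #           converted.append({"role": msg.get("role", "user"), "content": content})
    #       return converted
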
    async def _stream_response(
        self,
        url: str,
        headers: dict,
        body: dict,
        provider: str,
        model: str,
        request_id: str,
        original_tokens: int,
        optimized_tokens: int,
        tokens_saved: int,
        transforms_applied: list[str],
        tags: dict[str, str],
        optimization_latency: float,
    ) -> StreamingResponse:
        """Stream response with metrics tracking.

        Calculates output size incrementally to avoid accumulating all chunks in memory.
        """
        start_time = time.time()

        async def generate():
            # Track total bytes incrementally instead of accumulating chunks
            total_bytes = 0
            try:
                async with self.http_client.stream(
                    "POST", url, json=body, headers=headers
                ) as response:
                    async for chunk in response.aiter_bytes():
                        total_bytes += len(chunk)
                        yield chunk
            finally:
                # Record metrics after stream completes
                total_latency = (time.time() - start_time) * 1000

                # Estimate output tokens from total bytes (rough estimate: ~4 bytes per token)
                output_tokens = total_bytes // 4

                await self.metrics.record_request(
                    provider=provider,
                    model=model,
                    input_tokens=optimized_tokens,
                    output_tokens=output_tokens,
                    tokens_saved=tokens_saved,
                    latency_ms=total_latency,
                )

                if tokens_saved > 0:
                    logger.info(
                        f"[{request_id}] {model}: saved {tokens_saved:,} tokens (streaming)"
                    )

        return StreamingResponse(
            generate(),
            media_type="text/event-stream",
        )

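    # --- Illustrative note (not part of server.py): the streaming handler above never sees
    # --- provider-reported usage, so it estimates output tokens from raw event-stream bytes
    # --- at roughly 4 bytes per token. For example, a stream totalling 20_000 bytes is
    # --- recorded as 20_000 // 4 = 5_000 output tokens - a coarse but memory-free estimate.
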
    async def handle_openai_chat(
        self,
        request: Request,
    ) -> Response | StreamingResponse:
        """Handle OpenAI /v1/chat/completions endpoint."""
        start_time = time.time()
        request_id = await self._next_request_id()

        # Check request body size
        content_length = request.headers.get("content-length")
        if content_length and int(content_length) > MAX_REQUEST_BODY_SIZE:
            return JSONResponse(
                status_code=413,
                content={
                    "error": {
                        "message": f"Request body too large. Maximum size is {MAX_REQUEST_BODY_SIZE // (1024 * 1024)}MB",
                        "type": "invalid_request_error",
                        "code": "request_too_large",
                    }
                },
            )

        # Parse request
        try:
            body = await request.json()
        except json.JSONDecodeError as e:
            return JSONResponse(
                status_code=400,
                content={
                    "error": {
                        "message": f"Invalid JSON in request body: {e!s}",
                        "type": "invalid_request_error",
                        "code": "invalid_json",
                    }
                },
            )
        model = body.get("model", "unknown")
        messages = body.get("messages", [])
        stream = body.get("stream", False)

        headers = dict(request.headers.items())
        headers.pop("host", None)
        headers.pop("content-length", None)
        tags = self._extract_tags(headers)

        # Rate limiting
        if self.rate_limiter:
            rate_key = headers.get("authorization", "default")[:20]
            allowed, wait_seconds = await self.rate_limiter.check_request(rate_key)
            if not allowed:
                await self.metrics.record_rate_limited()
                raise HTTPException(
                    status_code=429,
                    detail=f"Rate limited. Retry after {wait_seconds:.1f}s",
                )

        # Check cache
        if self.cache and not stream:
            cached = await self.cache.get(messages, model)
            if cached:
                await self.metrics.record_request(
                    provider="openai",
                    model=model,
                    input_tokens=0,
                    output_tokens=0,
                    tokens_saved=cached.tokens_saved_per_hit,
                    latency_ms=(time.time() - start_time) * 1000,
                    cached=True,
                )

                # Remove compression headers from cached response
                response_headers = dict(cached.response_headers)
                response_headers.pop("content-encoding", None)
                response_headers.pop("content-length", None)

                return Response(content=cached.response_body, headers=response_headers)

        # Token counting
        tokenizer = get_tokenizer(model)
        original_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)

        # Optimization
        transforms_applied = []
        optimized_messages = messages
        optimized_tokens = original_tokens

        if self.config.optimize and messages:
            try:
                context_limit = self.openai_provider.get_context_limit(model)
                result = self.openai_pipeline.apply(
                    messages=messages,
                    model=model,
                    model_limit=context_limit,
                )
                if result.messages != messages:
                    optimized_messages = result.messages
                    transforms_applied = result.transforms_applied
                    optimized_tokens = sum(
                        tokenizer.count_text(str(m.get("content", ""))) for m in optimized_messages
                    )
            except Exception as e:
                logger.warning(f"Optimization failed: {e}")

        tokens_saved = original_tokens - optimized_tokens
        optimization_latency = (time.time() - start_time) * 1000

        # CCR Tool Injection: Inject retrieval tool if compression occurred
        tools = body.get("tools")
        if self.config.ccr_inject_tool or self.config.ccr_inject_system_instructions:
            injector = CCRToolInjector(
                provider="openai",
                inject_tool=self.config.ccr_inject_tool,
                inject_system_instructions=self.config.ccr_inject_system_instructions,
            )
            optimized_messages, tools, was_injected = injector.process_request(
                optimized_messages, tools
            )

            if injector.has_compressed_content:
                if was_injected:
                    logger.debug(
                        f"[{request_id}] CCR: Injected retrieval tool for hashes: {injector.detected_hashes}"
                    )
                else:
                    logger.debug(
                        f"[{request_id}] CCR: Tool already present (MCP?), skipped injection for hashes: {injector.detected_hashes}"
                    )

        body["messages"] = optimized_messages
        if tools is not None:
            body["tools"] = tools
        url = f"{self.OPENAI_API_URL}/v1/chat/completions"

        try:
            if stream:
                return await self._stream_response(
                    url,
                    headers,
                    body,
                    "openai",
                    model,
                    request_id,
                    original_tokens,
                    optimized_tokens,
                    tokens_saved,
                    transforms_applied,
                    tags,
                    optimization_latency,
                )
            else:
                response = await self._retry_request("POST", url, headers, body)
                total_latency = (time.time() - start_time) * 1000

                output_tokens = 0
                try:
                    resp_json = response.json()
                    usage = resp_json.get("usage", {})
                    output_tokens = usage.get("completion_tokens", 0)
                except Exception:
                    pass

                # Cost tracking
                cost_usd = savings_usd = None
                if self.cost_tracker:
                    cost_usd = self.cost_tracker.estimate_cost(
                        model, optimized_tokens, output_tokens
                    )
                    original_cost = self.cost_tracker.estimate_cost(
                        model, original_tokens, output_tokens
                    )
                    if cost_usd and original_cost:
                        savings_usd = original_cost - cost_usd
                        self.cost_tracker.record_cost(cost_usd)
                        self.cost_tracker.record_savings(savings_usd)

                # Cache
                if self.cache and response.status_code == 200:
                    await self.cache.set(
                        messages, model, response.content, dict(response.headers), tokens_saved
                    )

                # Metrics
                await self.metrics.record_request(
                    provider="openai",
                    model=model,
                    input_tokens=optimized_tokens,
                    output_tokens=output_tokens,
                    tokens_saved=tokens_saved,
                    latency_ms=total_latency,
                    cost_usd=cost_usd or 0,
                    savings_usd=savings_usd or 0,
                )

                if tokens_saved > 0:
                    logger.info(
                        f"[{request_id}] {model}: {original_tokens:,} → {optimized_tokens:,} "
                        f"(saved {tokens_saved:,} tokens)"
                    )

                # Remove compression headers since httpx already decompressed the response
                response_headers = dict(response.headers)
                response_headers.pop("content-encoding", None)
                response_headers.pop("content-length", None)  # Length changed after decompression

                return Response(
                    content=response.content,
                    status_code=response.status_code,
                    headers=response_headers,
                )
        except Exception as e:
            await self.metrics.record_failed()
            # Log full error details internally for debugging
            logger.error(f"[{request_id}] OpenAI request failed: {type(e).__name__}: {e}")
            # Return sanitized error message to client (don't expose internal details)
            return JSONResponse(
                status_code=502,
                content={
                    "error": {
                        "message": "An error occurred while processing your request. Please try again.",
                        "type": "server_error",
                        "code": "proxy_error",
                    }
                },
            )

    async def handle_passthrough(self, request: Request, base_url: str) -> Response:
        """Pass through request unchanged."""
        path = request.url.path
        url = f"{base_url}{path}"

        headers = dict(request.headers.items())
        headers.pop("host", None)

        body = await request.body()

        response = await self.http_client.request(  # type: ignore[union-attr]
            method=request.method,
            url=url,
            headers=headers,
            content=body,
        )

        # Remove compression headers since httpx already decompressed the response
        response_headers = dict(response.headers)
        response_headers.pop("content-encoding", None)
        response_headers.pop("content-length", None)  # Length changed after decompression

        return Response(
            content=response.content,
            status_code=response.status_code,
            headers=response_headers,
        )


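# --- Illustrative usage sketch (not part of server.py): once the proxy is running, existing
# --- SDK clients only need their base URL pointed at it. The port, model name, and API key
# --- below are placeholders.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://127.0.0.1:8787/v1", api_key="sk-...")
#   client.chat.completions.create(
#       model="gpt-4o-mini",
#       messages=[{"role": "user", "content": "hello"}],
#   )
#
#   # For Anthropic / Claude Code traffic, point the base URL at the same proxy:
#   #   ANTHROPIC_BASE_URL=http://127.0.0.1:8787 claude
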
# =============================================================================
# FastAPI App
# =============================================================================


def create_app(config: ProxyConfig | None = None) -> FastAPI:
    """Create FastAPI application."""
    if not FASTAPI_AVAILABLE:
        raise ImportError("FastAPI required. Install: pip install fastapi uvicorn httpx")

    config = config or ProxyConfig()

    app = FastAPI(
        title="Headroom Proxy",
        description="Production-ready LLM optimization proxy",
        version="1.0.0",
    )

    # CORS
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    proxy = HeadroomProxy(config)

    @app.on_event("startup")
    async def startup():
        await proxy.startup()

    @app.on_event("shutdown")
    async def shutdown():
        await proxy.shutdown()

    # Health & Metrics
    @app.get("/health")
    async def health():
        return {
            "status": "healthy",
            "version": "1.0.0",
            "config": {
                "optimize": config.optimize,
                "cache": config.cache_enabled,
                "rate_limit": config.rate_limit_enabled,
            },
        }

    @app.get("/stats")
    async def stats():
        """Get comprehensive proxy statistics.

        This is the main stats endpoint - it aggregates data from all subsystems:
        - Request metrics (total, cached, failed, by model/provider)
        - Token usage and savings
        - Cost tracking
        - Compression (CCR) statistics
        - Telemetry/TOIN (data flywheel) statistics
        - Cache and rate limiter stats
        """
        m = proxy.metrics

        # Calculate average latency
        avg_latency_ms = round(m.latency_sum_ms / m.latency_count, 2) if m.latency_count > 0 else 0

        # Get compression store stats
        store = get_compression_store()
        compression_stats = store.get_stats()

        # Get telemetry/TOIN stats
        telemetry = get_telemetry_collector()
        telemetry_stats = telemetry.get_stats()

        # Get feedback loop stats
        feedback = get_compression_feedback()
        feedback_stats = feedback.get_stats()

        # Calculate total tokens before compression
        total_tokens_before = m.tokens_input_total + m.tokens_saved_total

        return {
            "requests": {
                "total": m.requests_total,
                "cached": m.requests_cached,
                "rate_limited": m.requests_rate_limited,
                "failed": m.requests_failed,
                "by_provider": dict(m.requests_by_provider),
                "by_model": dict(m.requests_by_model),
            },
            "tokens": {
                "input": m.tokens_input_total,
                "output": m.tokens_output_total,
                "saved": m.tokens_saved_total,
                "total_before_compression": total_tokens_before,
                "savings_percent": round(
                    (m.tokens_saved_total / total_tokens_before * 100)
                    if total_tokens_before > 0
                    else 0,
                    2,
                ),
            },
            "latency": {
                "average_ms": avg_latency_ms,
                "total_requests": m.latency_count,
            },
            "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
            "compression": {
                "ccr_entries": compression_stats.get("entry_count", 0),
                "ccr_max_entries": compression_stats.get("max_entries", 0),
                "original_tokens_cached": compression_stats.get("total_original_tokens", 0),
                "compressed_tokens_cached": compression_stats.get("total_compressed_tokens", 0),
                "ccr_retrievals": compression_stats.get("total_retrievals", 0),
            },
            "telemetry": {
                "enabled": telemetry_stats.get("enabled", False),
                "total_compressions": telemetry_stats.get("total_compressions", 0),
                "total_retrievals": telemetry_stats.get("total_retrievals", 0),
                "global_retrieval_rate": round(telemetry_stats.get("global_retrieval_rate", 0), 4),
                "tool_signatures_tracked": telemetry_stats.get("tool_signatures_tracked", 0),
                "avg_compression_ratio": round(telemetry_stats.get("avg_compression_ratio", 0), 4),
                "avg_token_reduction": round(telemetry_stats.get("avg_token_reduction", 0), 4),
            },
            "feedback_loop": {
                "tools_tracked": feedback_stats.get("tools_tracked", 0),
                "total_compressions": feedback_stats.get("total_compressions", 0),
                "total_retrievals": feedback_stats.get("total_retrievals", 0),
                "global_retrieval_rate": round(feedback_stats.get("global_retrieval_rate", 0), 4),
                "tools_with_high_retrieval": sum(
                    1
                    for p in feedback_stats.get("tool_patterns", {}).values()
                    if p.get("retrieval_rate", 0) > 0.3
                ),
            },
            "cache": await proxy.cache.stats() if proxy.cache else None,
            "rate_limiter": await proxy.rate_limiter.stats() if proxy.rate_limiter else None,
            "recent_requests": proxy.logger.get_recent(10) if proxy.logger else [],
        }

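    # --- Illustrative note (not part of server.py): "savings_percent" above is computed
    # --- against the pre-compression total, i.e. saved / (input + saved) * 100. For example,
    # --- 30_000 input tokens actually sent plus 10_000 tokens saved reports
    # --- 10_000 / 40_000 * 100 = 25.0.
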
    @app.get("/metrics")
    async def metrics():
        """Prometheus metrics endpoint."""
        return PlainTextResponse(
            await proxy.metrics.export(),
            media_type="text/plain; version=0.0.4",
        )

    @app.post("/cache/clear")
    async def clear_cache():
        """Clear the response cache."""
        if proxy.cache:
            await proxy.cache.clear()
            return {"status": "cleared"}
        return {"status": "cache disabled"}

    # CCR (Compress-Cache-Retrieve) endpoints
    @app.post("/v1/retrieve")
    async def ccr_retrieve(request: Request):
        """Retrieve original content from CCR compression cache.

        This is the "Retrieve" part of CCR (Compress-Cache-Retrieve).
        When SmartCrusher compresses tool outputs, the original data is cached.
        LLMs can call this endpoint to get more data if needed.

        Request body:
            hash (str): Hash key from compression marker (required)
            query (str): Optional search query to filter results

        Response:
            Full retrieval: {"hash": "...", "original_content": "...", ...}
            Search: {"hash": "...", "query": "...", "results": [...], "count": N}
        """
        data = await request.json()
        hash_key = data.get("hash")
        query = data.get("query")

        if not hash_key:
            raise HTTPException(status_code=400, detail="hash required")

        store = get_compression_store()

        if query:
            # Search within cached content
            results = store.search(hash_key, query)
            return {
                "hash": hash_key,
                "query": query,
                "results": results,
                "count": len(results),
            }
        else:
            # Return full original content
            entry = store.retrieve(hash_key)
            if entry:
                return {
                    "hash": hash_key,
                    "original_content": entry.original_content,
                    "original_tokens": entry.original_tokens,
                    "original_item_count": entry.original_item_count,
                    "compressed_item_count": entry.compressed_item_count,
                    "tool_name": entry.tool_name,
                    "retrieval_count": entry.retrieval_count,
                }
            raise HTTPException(
                status_code=404, detail="Entry not found or expired (TTL: 5 minutes)"
            )

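    # --- Illustrative usage sketch (not part of server.py): calling the CCR retrieve endpoint
    # --- documented above with httpx; host, port, and hash value are placeholders taken from
    # --- a compression marker.
    #
    #   import httpx
    #   resp = httpx.post(
    #       "http://127.0.0.1:8787/v1/retrieve",
    #       json={"hash": "abc123", "query": "error"},  # omit "query" for a full retrieval
    #   )
    #   print(resp.json()["count"])
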
    @app.get("/v1/retrieve/stats")
    async def ccr_stats():
        """Get CCR compression store statistics."""
        store = get_compression_store()
        stats = store.get_stats()
        events = store.get_retrieval_events(limit=20)
        return {
            "store": stats,
            "recent_retrievals": [
                {
                    "hash": e.hash,
                    "query": e.query,
                    "items_retrieved": e.items_retrieved,
                    "total_items": e.total_items,
                    "tool_name": e.tool_name,
                    "retrieval_type": e.retrieval_type,
                }
                for e in events
            ],
        }

    @app.get("/v1/feedback")
    async def ccr_feedback():
        """Get CCR feedback loop statistics and learned patterns.

        This endpoint exposes the feedback loop's learned patterns for monitoring
        and debugging. It shows:
        - Per-tool retrieval rates (high = compress less aggressively)
        - Common search queries per tool
        - Queried fields (suggest what to preserve)

        Use this to understand how well compression is working and whether
        the feedback loop is adjusting appropriately.
        """
        feedback = get_compression_feedback()
        stats = feedback.get_stats()
        return {
            "feedback": stats,
            "hints_example": {
                tool_name: {
                    "hints": {
                        "max_items": hints.max_items
                        if (hints := feedback.get_compression_hints(tool_name))
                        else 15,
                        "suggested_items": hints.suggested_items if hints else None,
                        "skip_compression": hints.skip_compression if hints else False,
                        "preserve_fields": hints.preserve_fields if hints else [],
                        "reason": hints.reason if hints else "",
                    }
                }
                for tool_name in list(stats.get("tool_patterns", {}).keys())[:5]
            },
        }

    @app.get("/v1/feedback/{tool_name}")
    async def ccr_feedback_for_tool(tool_name: str):
        """Get compression hints for a specific tool.

        Returns feedback-based hints that would be used for compressing
        this tool's output.
        """
        feedback = get_compression_feedback()
        hints = feedback.get_compression_hints(tool_name)
        patterns = feedback.get_all_patterns().get(tool_name)

        return {
            "tool_name": tool_name,
            "hints": {
                "max_items": hints.max_items,
                "min_items": hints.min_items,
                "suggested_items": hints.suggested_items,
                "aggressiveness": hints.aggressiveness,
                "skip_compression": hints.skip_compression,
                "preserve_fields": hints.preserve_fields,
                "reason": hints.reason,
            },
            "pattern": {
                "total_compressions": patterns.total_compressions if patterns else 0,
                "total_retrievals": patterns.total_retrievals if patterns else 0,
                "retrieval_rate": patterns.retrieval_rate if patterns else 0.0,
                "full_retrieval_rate": patterns.full_retrieval_rate if patterns else 0.0,
                "search_rate": patterns.search_rate if patterns else 0.0,
                "common_queries": list(patterns.common_queries.keys())[:10] if patterns else [],
                "queried_fields": list(patterns.queried_fields.keys())[:10] if patterns else [],
            }
            if patterns
            else None,
        }

    # Telemetry endpoints (Data Flywheel)
    @app.get("/v1/telemetry")
    async def telemetry_stats():
        """Get telemetry statistics for the data flywheel.

        This endpoint exposes privacy-preserving telemetry data that powers
        the data flywheel - learning optimal compression strategies across
        tool types based on usage patterns.

        What's collected (anonymized):
        - Tool output structure patterns (field types, not values)
        - Compression decisions and ratios
        - Retrieval patterns (rate, type, not content)
        - Strategy effectiveness

        What's NOT collected:
        - Actual data values
        - User identifiers
        - Queries or search terms
        - File paths or tool names (hashed by default)
        """
        telemetry = get_telemetry_collector()
        return telemetry.get_stats()

    @app.get("/v1/telemetry/export")
    async def telemetry_export():
        """Export full telemetry data for aggregation.

        This endpoint exports all telemetry data in a format suitable for
        cross-user aggregation. The data is privacy-preserving - no actual
        values are included, only structural patterns and statistics.

        Use this for:
        - Building a central learning service
        - Sharing learned patterns across instances
        - Analysis and debugging
        """
        telemetry = get_telemetry_collector()
        return telemetry.export_stats()

    @app.post("/v1/telemetry/import")
    async def telemetry_import(request: Request):
        """Import telemetry data from another source.

        This allows merging telemetry from multiple sources for cross-user
        learning. The imported data is merged with existing statistics.

        Request body: Telemetry export data from /v1/telemetry/export
        """
        telemetry = get_telemetry_collector()
        data = await request.json()
        telemetry.import_stats(data)
        return {"status": "imported", "current_stats": telemetry.get_stats()}

    @app.get("/v1/telemetry/tools")
    async def telemetry_tools():
        """Get telemetry statistics for all tracked tool signatures.

        Returns statistics per tool signature (anonymized), including:
        - Compression ratios and strategy usage
        - Retrieval rates (high = compression too aggressive)
        - Learned recommendations
        """
        telemetry = get_telemetry_collector()
        all_stats = telemetry.get_all_tool_stats()
        return {
            "tool_count": len(all_stats),
            "tools": {sig_hash: stats.to_dict() for sig_hash, stats in all_stats.items()},
        }

    @app.get("/v1/telemetry/tools/{signature_hash}")
    async def telemetry_tool_detail(signature_hash: str):
        """Get detailed telemetry for a specific tool signature.

        Includes learned recommendations if enough data has been collected.
        """
        telemetry = get_telemetry_collector()
        stats = telemetry.get_tool_stats(signature_hash)
        recommendations = telemetry.get_recommendations(signature_hash)

        if stats is None:
            raise HTTPException(
                status_code=404, detail=f"No telemetry found for signature: {signature_hash}"
            )

        return {
            "signature_hash": signature_hash,
            "stats": stats.to_dict(),
            "recommendations": recommendations,
        }

    @app.get("/v1/retrieve/{hash_key}")
    async def ccr_retrieve_get(hash_key: str, query: str | None = None):
        """GET version of CCR retrieve for easier testing."""
        store = get_compression_store()

        if query:
            results = store.search(hash_key, query)
            return {
                "hash": hash_key,
                "query": query,
                "results": results,
                "count": len(results),
            }
        else:
            entry = store.retrieve(hash_key)
            if entry:
                return {
                    "hash": hash_key,
                    "original_content": entry.original_content,
                    "original_tokens": entry.original_tokens,
                    "original_item_count": entry.original_item_count,
                    "compressed_item_count": entry.compressed_item_count,
                    "tool_name": entry.tool_name,
                    "retrieval_count": entry.retrieval_count,
                }
            raise HTTPException(status_code=404, detail="Entry not found or expired")

    # CCR Tool Call Handler - for agent frameworks to call when LLM uses headroom_retrieve
    @app.post("/v1/retrieve/tool_call")
    async def ccr_handle_tool_call(request: Request):
        """Handle a CCR tool call from an LLM response.

        This endpoint accepts tool call formats from various providers and returns
        a properly formatted tool result. Agent frameworks can use this to handle
        CCR tool calls without implementing the retrieval logic themselves.

        Request body (Anthropic format):
            {
                "tool_call": {
                    "id": "toolu_123",
                    "name": "headroom_retrieve",
                    "input": {"hash": "abc123", "query": "optional search"}
                },
                "provider": "anthropic"
            }

        Request body (OpenAI format):
            {
                "tool_call": {
                    "id": "call_123",
                    "function": {
                        "name": "headroom_retrieve",
                        "arguments": "{\"hash\": \"abc123\"}"
                    }
                },
                "provider": "openai"
            }

        Response:
            {
                "tool_result": {...},  # Formatted for the provider
                "success": true,
                "data": {...}  # Raw retrieval data
            }
        """
        data = await request.json()
        tool_call = data.get("tool_call", {})
        provider = data.get("provider", "anthropic")

        # Parse the tool call
        hash_key, query = parse_tool_call(tool_call, provider)

        if hash_key is None:
            raise HTTPException(
                status_code=400, detail=f"Invalid tool call or not a {CCR_TOOL_NAME} call"
            )

        # Perform retrieval
        store = get_compression_store()

        if query:
            results = store.search(hash_key, query)
            retrieval_data = {
                "hash": hash_key,
                "query": query,
                "results": results,
                "count": len(results),
            }
        else:
            entry = store.retrieve(hash_key)
            if entry:
                retrieval_data = {
                    "hash": hash_key,
                    "original_content": entry.original_content,
                    "original_item_count": entry.original_item_count,
                    "compressed_item_count": entry.compressed_item_count,
                }
            else:
                retrieval_data = {
                    "error": "Entry not found or expired (TTL: 5 minutes)",
                    "hash": hash_key,
                }

        # Format tool result for provider
        tool_call_id = tool_call.get("id", "")
        result_content = json.dumps(retrieval_data, indent=2)

        if provider == "anthropic":
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_call_id,
                "content": result_content,
            }
        elif provider == "openai":
            tool_result = {
                "role": "tool",
                "tool_call_id": tool_call_id,
                "content": result_content,
            }
        else:
            tool_result = {
                "tool_call_id": tool_call_id,
                "content": result_content,
            }

        return {
            "tool_result": tool_result,
            "success": "error" not in retrieval_data,
            "data": retrieval_data,
        }

    # Anthropic endpoints
    @app.post("/v1/messages")
    async def anthropic_messages(request: Request):
        return await proxy.handle_anthropic_messages(request)

    @app.post("/v1/messages/count_tokens")
    async def anthropic_count_tokens(request: Request):
        return await proxy.handle_passthrough(request, proxy.ANTHROPIC_API_URL)

    # OpenAI endpoints
    @app.post("/v1/chat/completions")
    async def openai_chat(request: Request):
        return await proxy.handle_openai_chat(request)

    # Passthrough - route to correct backend based on headers
    @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
    async def passthrough(request: Request, path: str):
        # Anthropic SDK always sends anthropic-version header and uses x-api-key for auth
        # OpenAI SDK uses Authorization: Bearer for auth
        if request.headers.get("anthropic-version") or request.headers.get("x-api-key"):
            base_url = proxy.ANTHROPIC_API_URL
        else:
            base_url = proxy.OPENAI_API_URL
        return await proxy.handle_passthrough(request, base_url)

    return app


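# --- Illustrative usage sketch (not part of server.py): embedding the proxy in your own
# --- process instead of using run_server(). The import path for ProxyConfig and the
# --- host/port values are assumptions.
#
#   import uvicorn
#   from headroom.proxy.server import create_app, ProxyConfig  # ProxyConfig location assumed
#
#   app = create_app(ProxyConfig(host="127.0.0.1", port=8787, cache_enabled=True))
#   uvicorn.run(app, host="127.0.0.1", port=8787)
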
def _get_llmlingua_banner_status(config: ProxyConfig) -> str:
    """Get LLMLingua status line for banner."""
    if config.llmlingua_enabled:
        if _LLMLINGUA_AVAILABLE:
            return (
                f"ENABLED (device={config.llmlingua_device}, rate={config.llmlingua_target_rate})"
            )
        else:
            return "NOT INSTALLED (pip install headroom-ai[llmlingua])"
    else:
        if _LLMLINGUA_AVAILABLE:
            return "DISABLED (remove --no-llmlingua to enable)"
        return "DISABLED"


def _get_code_aware_banner_status(config: ProxyConfig) -> str:
    """Get code-aware compression status line for banner."""
    if config.code_aware_enabled:
        if is_tree_sitter_available():
            return "ENABLED (AST-based)"
        else:
            return "NOT INSTALLED (pip install headroom-ai[code])"
    else:
        if is_tree_sitter_available():
            return "DISABLED (remove --no-code-aware to enable)"
        return "DISABLED"


def run_server(config: ProxyConfig | None = None):
    """Run the proxy server."""
    if not FASTAPI_AVAILABLE:
        print("ERROR: FastAPI required. Install: pip install fastapi uvicorn httpx")
        sys.exit(1)

    config = config or ProxyConfig()
    app = create_app(config)

    llmlingua_status = _get_llmlingua_banner_status(config)
    code_aware_status = _get_code_aware_banner_status(config)

    print(f"""
╔══════════════════════════════════════════════════════════════════════╗
║  HEADROOM PROXY SERVER                                                 ║
╠══════════════════════════════════════════════════════════════════════╣
║  Version: 1.0.0                                                        ║
║  Listening: http://{config.host}:{config.port:<5}                      ║
╠══════════════════════════════════════════════════════════════════════╣
║  FEATURES:                                                             ║
║    Optimization:  {"ENABLED " if config.optimize else "DISABLED"}      ║
║    Caching:       {"ENABLED " if config.cache_enabled else "DISABLED"} (TTL: {config.cache_ttl_seconds}s) ║
║    Rate Limiting: {"ENABLED " if config.rate_limit_enabled else "DISABLED"} ({config.rate_limit_requests_per_minute} req/min, {config.rate_limit_tokens_per_minute:,} tok/min) ║
║    Retry:         {"ENABLED " if config.retry_enabled else "DISABLED"} (max {config.retry_max_attempts} attempts) ║
║    Cost Tracking: {"ENABLED " if config.cost_tracking_enabled else "DISABLED"} (budget: {"$" + str(config.budget_limit_usd) + "/" + config.budget_period if config.budget_limit_usd else "unlimited"}) ║
║    LLMLingua:     {llmlingua_status:<52}║
║    Code-Aware:    {code_aware_status:<52}║
╠══════════════════════════════════════════════════════════════════════╣
║  USAGE:                                                                ║
║    Claude Code: ANTHROPIC_BASE_URL=http://{config.host}:{config.port} claude ║
║    Cursor:      Set base URL in settings                               ║
╠══════════════════════════════════════════════════════════════════════╣
║  ENDPOINTS:                                                            ║
║    /health                  Health check                               ║
║    /stats                   Detailed statistics                        ║
║    /metrics                 Prometheus metrics                         ║
║    /cache/clear             Clear response cache                       ║
║    /v1/retrieve             CCR: Retrieve compressed content           ║
║    /v1/retrieve/stats       CCR: Compression store stats               ║
║    /v1/retrieve/tool_call   CCR: Handle LLM tool calls                 ║
║    /v1/feedback             CCR: Feedback loop stats & patterns        ║
║    /v1/feedback/{{tool}}      CCR: Compression hints for a tool          ║
║    /v1/telemetry            Data flywheel: Telemetry stats             ║
║    /v1/telemetry/export     Data flywheel: Export for aggregation      ║
║    /v1/telemetry/tools      Data flywheel: Per-tool stats              ║
╚══════════════════════════════════════════════════════════════════════╝
""")

    uvicorn.run(app, host=config.host, port=config.port, log_level="warning")


def _get_env_bool(name: str, default: bool) -> bool:
    """Get boolean from environment variable."""
    val = os.environ.get(name)
    if val is None:
        return default
    return val.lower() in ("true", "1", "yes", "on")


def _get_env_int(name: str, default: int) -> int:
    """Get integer from environment variable."""
    val = os.environ.get(name)
    if val is None:
        return default
    try:
        return int(val)
    except ValueError:
        return default


def _get_env_float(name: str, default: float) -> float:
    """Get float from environment variable."""
    val = os.environ.get(name)
    if val is None:
        return default
    try:
        return float(val)
    except ValueError:
        return default


def _get_env_str(name: str, default: str) -> str:
    """Get string from environment variable."""
    return os.environ.get(name, default)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Headroom Proxy Server")

    # Server
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8787)

    # Optimization
    parser.add_argument("--no-optimize", action="store_true", help="Disable optimization")
    parser.add_argument("--min-tokens", type=int, default=500, help="Min tokens to crush")
    parser.add_argument("--max-items", type=int, default=50, help="Max items after crush")

    # Caching
    parser.add_argument("--no-cache", action="store_true", help="Disable caching")
    parser.add_argument("--cache-ttl", type=int, default=3600, help="Cache TTL seconds")

    # Rate limiting
    parser.add_argument("--no-rate-limit", action="store_true", help="Disable rate limiting")
    parser.add_argument("--rpm", type=int, default=60, help="Requests per minute")
    parser.add_argument("--tpm", type=int, default=100000, help="Tokens per minute")

    # Cost
    parser.add_argument("--budget", type=float, help="Budget limit in USD")
    parser.add_argument("--budget-period", choices=["hourly", "daily", "monthly"], default="daily")

    # Logging
    parser.add_argument("--log-file", help="Log file path")
    parser.add_argument("--log-messages", action="store_true", help="Log full messages")

    # Smart routing (content-aware compression)
    parser.add_argument(
        "--no-smart-routing",
        action="store_true",
        help="Disable smart routing (use legacy sequential pipeline)",
    )

    # LLMLingua ML-based compression
    parser.add_argument(
        "--llmlingua",
        action="store_true",
        help="Enable LLMLingua-2 ML-based compression (requires: pip install headroom-ai[llmlingua])",
    )
    parser.add_argument(
        "--no-llmlingua",
        action="store_true",
        help="Disable LLMLingua compression",
    )
    parser.add_argument(
        "--llmlingua-device",
        choices=["auto", "cuda", "cpu", "mps"],
        default="auto",
        help="Device for LLMLingua model (default: auto)",
    )
    parser.add_argument(
        "--llmlingua-rate",
        type=float,
        default=0.3,
        help="LLMLingua target compression rate, 0.0-1.0 (default: 0.3 = keep 30%%)",
    )

    # Code-aware compression
    parser.add_argument(
        "--code-aware",
        action="store_true",
        help="Enable AST-based code compression (requires: pip install headroom-ai[code])",
    )
    parser.add_argument(
        "--no-code-aware",
        action="store_true",
        help="Disable code-aware compression",
    )

    args = parser.parse_args()

    # Environment variable defaults (HEADROOM_* prefix)
    # CLI args override env vars, env vars override ProxyConfig defaults
    env_smart_routing = _get_env_bool("HEADROOM_SMART_ROUTING", True)
    env_llmlingua = _get_env_bool("HEADROOM_LLMLINGUA_ENABLED", True)
    env_code_aware = _get_env_bool("HEADROOM_CODE_AWARE_ENABLED", True)
    env_optimize = _get_env_bool("HEADROOM_OPTIMIZE", True)
    env_cache = _get_env_bool("HEADROOM_CACHE_ENABLED", True)
    env_rate_limit = _get_env_bool("HEADROOM_RATE_LIMIT_ENABLED", True)

    # Determine settings: CLI flags override env vars
    # --no-X explicitly disables, --X explicitly enables, neither uses env var
    smart_routing = env_smart_routing if not args.no_smart_routing else False
    llmlingua_enabled = (
        env_llmlingua
        if not (args.llmlingua or args.no_llmlingua)
        else (args.llmlingua or not args.no_llmlingua)
    )
    code_aware_enabled = (
        env_code_aware
        if not (args.code_aware or args.no_code_aware)
        else (args.code_aware or not args.no_code_aware)
    )
    optimize = env_optimize if not args.no_optimize else False
    cache_enabled = env_cache if not args.no_cache else False
    rate_limit_enabled = env_rate_limit if not args.no_rate_limit else False

    config = ProxyConfig(
        host=_get_env_str("HEADROOM_HOST", args.host),
        port=_get_env_int("HEADROOM_PORT", args.port),
        optimize=optimize,
        min_tokens_to_crush=_get_env_int("HEADROOM_MIN_TOKENS", args.min_tokens),
        max_items_after_crush=_get_env_int("HEADROOM_MAX_ITEMS", args.max_items),
        cache_enabled=cache_enabled,
        cache_ttl_seconds=_get_env_int("HEADROOM_CACHE_TTL", args.cache_ttl),
        rate_limit_enabled=rate_limit_enabled,
        rate_limit_requests_per_minute=_get_env_int("HEADROOM_RPM", args.rpm),
        rate_limit_tokens_per_minute=_get_env_int("HEADROOM_TPM", args.tpm),
        budget_limit_usd=args.budget,
        budget_period=args.budget_period,
        log_file=_get_env_str("HEADROOM_LOG_FILE", args.log_file)
        if args.log_file
        else os.environ.get("HEADROOM_LOG_FILE"),
        log_full_messages=args.log_messages or _get_env_bool("HEADROOM_LOG_MESSAGES", False),
        smart_routing=smart_routing,
        llmlingua_enabled=llmlingua_enabled,
        llmlingua_device=_get_env_str("HEADROOM_LLMLINGUA_DEVICE", args.llmlingua_device),
        llmlingua_target_rate=_get_env_float("HEADROOM_LLMLINGUA_RATE", args.llmlingua_rate),
        code_aware_enabled=code_aware_enabled,
    )

    run_server(config)