headroom_ai-0.2.13-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0

headroom/providers/openai_compatible.py
@@ -0,0 +1,521 @@
"""OpenAI-compatible provider for universal LLM support.

This provider supports any LLM service that implements the OpenAI API format:
- Ollama (local)
- vLLM (local/cloud)
- Together AI
- Groq
- Fireworks AI
- Anyscale
- LM Studio
- LocalAI
- Hugging Face Inference Endpoints
- Azure OpenAI
- And many more...

The key insight: 70%+ of LLM providers use OpenAI-compatible APIs,
so supporting this format gives near-universal coverage.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any

from headroom.tokenizers import get_tokenizer

from .base import Provider

logger = logging.getLogger(__name__)


@dataclass
class ModelCapabilities:
    """Model capability metadata.

    Stores information about a model's capabilities and constraints
    that the provider needs for token counting and cost estimation.
    """

    model: str
    context_window: int = 128000  # Default to 128K
    max_output_tokens: int = 4096
    supports_tools: bool = True
    supports_vision: bool = False
    supports_streaming: bool = True
    tokenizer_backend: str | None = None  # Force specific tokenizer
    input_cost_per_1m: float | None = None  # Cost per 1M input tokens
    output_cost_per_1m: float | None = None  # Cost per 1M output tokens


# Default context limits for common open models
# These are reasonable defaults; users can override
_DEFAULT_CONTEXT_LIMITS: dict[str, int] = {
    # Llama 3 family
    "llama-3": 8192,
    "llama-3-8b": 8192,
    "llama-3-70b": 8192,
    "llama-3.1": 128000,
    "llama-3.1-8b": 128000,
    "llama-3.1-70b": 128000,
    "llama-3.1-405b": 128000,
    "llama-3.2": 128000,
    "llama-3.3": 128000,
    # Llama 2 family
    "llama-2": 4096,
    "llama-2-7b": 4096,
    "llama-2-13b": 4096,
    "llama-2-70b": 4096,
    "codellama": 16384,
    # Mistral family
    "mistral": 32768,
    "mistral-7b": 32768,
    "mistral-nemo": 128000,
    "mistral-small": 32768,
    "mistral-large": 128000,
    "mixtral": 32768,
    "mixtral-8x7b": 32768,
    "mixtral-8x22b": 65536,
    # Qwen family
    "qwen": 32768,
    "qwen2": 32768,
    "qwen2-7b": 32768,
    "qwen2-72b": 32768,
    "qwen2.5": 131072,
    # DeepSeek
    "deepseek": 32768,
    "deepseek-coder": 16384,
    "deepseek-v2": 128000,
    "deepseek-v3": 128000,
    # Yi
    "yi": 32768,
    "yi-34b": 32768,
    # Phi
    "phi-2": 2048,
    "phi-3": 4096,
    "phi-3-mini": 4096,
    "phi-3-medium": 4096,
    # Others
    "falcon": 2048,
    "falcon-40b": 2048,
    "falcon-180b": 2048,
    "gemma": 8192,
    "gemma-2": 8192,
    "starcoder": 8192,
    "starcoder2": 16384,
}


class OpenAICompatibleTokenCounter:
    """Token counter for OpenAI-compatible providers.

    Uses the TokenizerRegistry to get the appropriate tokenizer
    for the model, falling back to estimation if needed.
    """

    def __init__(
        self,
        model: str,
        tokenizer_backend: str | None = None,
    ):
        """Initialize token counter.

        Args:
            model: Model name.
            tokenizer_backend: Force specific tokenizer backend.
        """
        self.model = model
        self._tokenizer = get_tokenizer(model, backend=tokenizer_backend)

    def count_text(self, text: str) -> int:
        """Count tokens in text."""
        return self._tokenizer.count_text(text)

    def count_message(self, message: dict[str, Any]) -> int:
        """Count tokens in a single message."""
        # Use OpenAI-style message overhead
        tokens = 4  # Base overhead

        role = message.get("role", "")
        tokens += self.count_text(role)

        content = message.get("content")
        if content:
            if isinstance(content, str):
                tokens += self.count_text(content)
            elif isinstance(content, list):
                for part in content:
                    if isinstance(part, dict):
                        if part.get("type") == "text":
                            tokens += self.count_text(part.get("text", ""))
                    elif isinstance(part, str):
                        tokens += self.count_text(part)

        name = message.get("name")
        if name:
            tokens += self.count_text(name) + 1

        tool_calls = message.get("tool_calls")
        if tool_calls:
            for tc in tool_calls:
                func = tc.get("function", {})
                tokens += self.count_text(func.get("name", ""))
                tokens += self.count_text(func.get("arguments", ""))
                tokens += 10

        tool_call_id = message.get("tool_call_id")
        if tool_call_id:
            tokens += self.count_text(tool_call_id) + 2

        return tokens

    def count_messages(self, messages: list[dict[str, Any]]) -> int:
        """Count tokens in a list of messages."""
        total = sum(self.count_message(msg) for msg in messages)
        total += 3  # Priming tokens
        return total


class OpenAICompatibleProvider(Provider):
    """Provider for OpenAI-compatible LLM services.

    Works with any service implementing the OpenAI chat completions API:
    - Ollama (local)
    - vLLM (local/cloud)
    - Together AI
    - Groq
    - Fireworks AI
    - LM Studio
    - LocalAI
    - And many more...

    Example:
        # For Ollama
        provider = OpenAICompatibleProvider(
            name="ollama",
            base_url="http://localhost:11434/v1",
            default_model="llama3.1",
        )

        # For Together AI
        provider = OpenAICompatibleProvider(
            name="together",
            base_url="https://api.together.xyz/v1",
        )

        # Get token counter for a specific model
        counter = provider.get_token_counter("llama-3.1-8b")
    """

    def __init__(
        self,
        name: str = "openai_compatible",
        base_url: str | None = None,
        api_key: str | None = None,
        default_model: str | None = None,
        models: dict[str, ModelCapabilities] | None = None,
    ):
        """Initialize OpenAI-compatible provider.

        Args:
            name: Provider name for identification.
            base_url: API base URL (e.g., 'http://localhost:11434/v1').
            api_key: API key (if required).
            default_model: Default model for operations.
            models: Custom model configurations.
        """
        self._name = name
        self.base_url = base_url
        self.api_key = api_key
        self.default_model = default_model
        self._models: dict[str, ModelCapabilities] = models or {}

    @property
    def name(self) -> str:
        return self._name

    def register_model(
        self,
        model: str,
        capabilities: ModelCapabilities | None = None,
        **kwargs: Any,
    ) -> None:
        """Register a model with its capabilities.

        Args:
            model: Model name.
            capabilities: Model capabilities object.
            **kwargs: Alternative way to specify capabilities.
        """
        if capabilities is not None:
            self._models[model] = capabilities
        else:
            self._models[model] = ModelCapabilities(model=model, **kwargs)

    def supports_model(self, model: str) -> bool:
        """Check if model is supported.

        OpenAI-compatible providers support any model by default,
        using estimation for token counting.
        """
        return True  # Always return True - we can estimate

    def get_token_counter(self, model: str) -> OpenAICompatibleTokenCounter:
        """Get token counter for a model.

        Uses the TokenizerRegistry to find the best tokenizer,
        with fallback to estimation.
        """
        tokenizer_backend = None

        # Check for registered model with specific tokenizer
        if model in self._models:
            tokenizer_backend = self._models[model].tokenizer_backend

        return OpenAICompatibleTokenCounter(model, tokenizer_backend)

    def get_context_limit(self, model: str) -> int:
        """Get context limit for a model.

        Priority:
        1. Registered model capabilities
        2. Default limits for known models
        3. Prefix matching
        4. Default 128K
        """
        # Check registered models
        if model in self._models:
            return self._models[model].context_window

        model_lower = model.lower()

        # Check default limits
        if model_lower in _DEFAULT_CONTEXT_LIMITS:
            return _DEFAULT_CONTEXT_LIMITS[model_lower]

        # Prefix match
        for prefix, limit in _DEFAULT_CONTEXT_LIMITS.items():
            if model_lower.startswith(prefix):
                return limit

        # Default to 128K for modern models
        return 128000

    def get_output_buffer(self, model: str, default: int = 4000) -> int:
        """Get recommended output buffer."""
        if model in self._models:
            return min(self._models[model].max_output_tokens, default)
        return default

    def estimate_cost(
        self,
        input_tokens: int,
        output_tokens: int,
        model: str,
        cached_tokens: int = 0,
    ) -> float | None:
        """Estimate cost if pricing is configured.

        Args:
            input_tokens: Number of input tokens.
            output_tokens: Number of output tokens.
            model: Model name.
            cached_tokens: Number of cached tokens.

        Returns:
            Estimated cost in USD, or None if pricing unknown.
        """
        if model not in self._models:
            return None

        caps = self._models[model]
        if caps.input_cost_per_1m is None or caps.output_cost_per_1m is None:
            return None

        input_cost = (input_tokens / 1_000_000) * caps.input_cost_per_1m
        output_cost = (output_tokens / 1_000_000) * caps.output_cost_per_1m

        return input_cost + output_cost


# Pre-configured provider factories for common services


def create_ollama_provider(
    base_url: str = "http://localhost:11434/v1",
) -> OpenAICompatibleProvider:
    """Create provider for Ollama.

    Ollama is a popular local LLM runner that supports many open models.

    Args:
        base_url: Ollama API URL (default: http://localhost:11434/v1).

    Returns:
        Configured provider.
    """
    return OpenAICompatibleProvider(
        name="ollama",
        base_url=base_url,
    )


def create_together_provider(
    api_key: str | None = None,
) -> OpenAICompatibleProvider:
    """Create provider for Together AI.

    Together AI offers high-performance inference for open models.

    Args:
        api_key: Together AI API key.

    Returns:
        Configured provider with Together AI pricing.
    """
    provider = OpenAICompatibleProvider(
        name="together",
        base_url="https://api.together.xyz/v1",
        api_key=api_key,
    )

    # Register common Together models with pricing
    # Pricing as of Jan 2025 (verify current rates)
    provider.register_model(
        "meta-llama/Llama-3.1-8B-Instruct-Turbo",
        context_window=128000,
        input_cost_per_1m=0.18,
        output_cost_per_1m=0.18,
    )
    provider.register_model(
        "meta-llama/Llama-3.1-70B-Instruct-Turbo",
        context_window=128000,
        input_cost_per_1m=0.88,
        output_cost_per_1m=0.88,
    )
    provider.register_model(
        "meta-llama/Llama-3.1-405B-Instruct-Turbo",
        context_window=128000,
        input_cost_per_1m=3.50,
        output_cost_per_1m=3.50,
    )

    return provider


def create_groq_provider(
    api_key: str | None = None,
) -> OpenAICompatibleProvider:
    """Create provider for Groq.

    Groq offers ultra-fast inference on custom hardware.

    Args:
        api_key: Groq API key.

    Returns:
        Configured provider with Groq pricing.
    """
    provider = OpenAICompatibleProvider(
        name="groq",
        base_url="https://api.groq.com/openai/v1",
        api_key=api_key,
    )

    # Register common Groq models with pricing
    # Pricing as of Jan 2025 (verify current rates)
    provider.register_model(
        "llama-3.1-8b-instant",
        context_window=128000,
        input_cost_per_1m=0.05,
        output_cost_per_1m=0.08,
    )
    provider.register_model(
        "llama-3.1-70b-versatile",
        context_window=128000,
        input_cost_per_1m=0.59,
        output_cost_per_1m=0.79,
    )
    provider.register_model(
        "mixtral-8x7b-32768",
        context_window=32768,
        input_cost_per_1m=0.24,
        output_cost_per_1m=0.24,
    )

    return provider


def create_fireworks_provider(
    api_key: str | None = None,
) -> OpenAICompatibleProvider:
    """Create provider for Fireworks AI.

    Args:
        api_key: Fireworks API key.

    Returns:
        Configured provider.
    """
    return OpenAICompatibleProvider(
        name="fireworks",
        base_url="https://api.fireworks.ai/inference/v1",
        api_key=api_key,
    )


def create_anyscale_provider(
    api_key: str | None = None,
) -> OpenAICompatibleProvider:
    """Create provider for Anyscale Endpoints.

    Args:
        api_key: Anyscale API key.

    Returns:
        Configured provider.
    """
    return OpenAICompatibleProvider(
        name="anyscale",
        base_url="https://api.endpoints.anyscale.com/v1",
        api_key=api_key,
    )


def create_vllm_provider(
    base_url: str,
) -> OpenAICompatibleProvider:
    """Create provider for vLLM server.

    vLLM is a high-performance inference engine.

    Args:
        base_url: vLLM server URL (e.g., 'http://localhost:8000/v1').

    Returns:
        Configured provider.
    """
    return OpenAICompatibleProvider(
        name="vllm",
        base_url=base_url,
    )


def create_lmstudio_provider(
    base_url: str = "http://localhost:1234/v1",
) -> OpenAICompatibleProvider:
    """Create provider for LM Studio.

    LM Studio is a desktop app for running local LLMs.

    Args:
        base_url: LM Studio API URL.

    Returns:
        Configured provider.
    """
    return OpenAICompatibleProvider(
        name="lmstudio",
        base_url=base_url,
    )
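
For reference, a minimal usage sketch of the module above. It is not part of the package; it only exercises calls shown in this hunk (create_groq_provider pre-registers llama-3.1-8b-instant at $0.05/$0.08 per 1M tokens), and the API key and messages are placeholders:

    from headroom.providers.openai_compatible import create_groq_provider

    provider = create_groq_provider(api_key="gsk-...")  # placeholder key

    counter = provider.get_token_counter("llama-3.1-8b-instant")
    prompt_tokens = counter.count_messages(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Summarize the deployment log."},
        ]
    )

    provider.get_context_limit("llama-3.1-8b-instant")  # 128000, from register_model above
    # Cost model from estimate_cost: (input / 1M) * input_cost + (output / 1M) * output_cost
    provider.estimate_cost(prompt_tokens, 200, "llama-3.1-8b-instant")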

headroom/proxy/__init__.py
@@ -0,0 +1,19 @@
"""Headroom Proxy Server.

A transparent proxy that sits between LLM clients (Claude Code, Cursor, etc.)
and LLM APIs (Anthropic, OpenAI), applying Headroom optimizations.

Usage:
    # Start the proxy
    python -m headroom.proxy.server

    # Use with Claude Code
    ANTHROPIC_BASE_URL=http://localhost:8787 claude

    # Use with Cursor (if using Anthropic)
    Set base URL in Cursor settings to http://localhost:8787
"""

from .server import create_app, run_server

__all__ = ["create_app", "run_server"]
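
The proxy package re-exports two entry points from headroom/proxy/server.py. A minimal sketch of how they might be wired up, assuming both can be called without required arguments (their real signatures live in server.py, which is not shown in this hunk):

    from headroom.proxy import create_app, run_server

    app = create_app()  # build the proxy app for embedding in another server (assumed no-arg call)
    run_server()        # or run the bundled server, then point a client at it:
                        #   ANTHROPIC_BASE_URL=http://localhost:8787 claude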