headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
@@ -0,0 +1,625 @@
"""Agno model wrapper for Headroom optimization.

This module provides HeadroomAgnoModel, which wraps any Agno model
to apply Headroom context optimization before API calls.
"""

from __future__ import annotations

import asyncio
import logging
import threading
import warnings
from collections.abc import AsyncIterator, Iterator
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
from uuid import uuid4

# Agno imports - these are optional dependencies
try:
    from agno.models.base import Model
    from agno.models.message import Message
    from agno.models.response import ModelResponse

    AGNO_AVAILABLE = True
except ImportError:
    AGNO_AVAILABLE = False
    Model = object  # type: ignore[misc,assignment]
    Message = dict  # type: ignore[misc,assignment]
    ModelResponse = dict  # type: ignore[misc,assignment]

from headroom import HeadroomConfig, HeadroomMode
from headroom.providers import OpenAIProvider
from headroom.transforms import TransformPipeline

from .providers import get_headroom_provider, get_model_name_from_agno

logger = logging.getLogger(__name__)


def _check_agno_available() -> None:
    """Raise ImportError if Agno is not installed."""
    if not AGNO_AVAILABLE:
        raise ImportError("Agno is required for this integration. Install with: pip install agno")


def agno_available() -> bool:
    """Check if Agno is installed."""
    return AGNO_AVAILABLE
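
# Example guard (illustrative, not part of the original module): callers can
# check agno_available() before importing the integration classes instead of
# wrapping the import in try/except themselves:
#
#     if agno_available():
#         from headroom.integrations.agno import HeadroomAgnoModel
#         model = HeadroomAgnoModel(wrapped_model=base_model)
#     else:
#         model = base_model  # run without Headroom optimization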


@dataclass
class OptimizationMetrics:
    """Metrics from a single optimization pass."""

    request_id: str
    timestamp: datetime
    tokens_before: int
    tokens_after: int
    tokens_saved: int
    savings_percent: float
    transforms_applied: list[str]
    model: str


@dataclass
class HeadroomAgnoModel(Model):  # type: ignore[misc]
    """Agno model wrapper that applies Headroom optimizations.

    Extends agno.models.base.Model to be fully compatible with Agno Agent.
    Wraps any Agno Model and automatically optimizes the context
    before each API call. Works with OpenAIChat, Claude, Gemini, and
    other Agno model types.

    Example:
        from agno.agent import Agent
        from agno.models.openai import OpenAIChat
        from headroom.integrations.agno import HeadroomAgnoModel

        # Basic usage
        model = OpenAIChat(id="gpt-4o")
        optimized = HeadroomAgnoModel(wrapped_model=model)

        # Use with agent
        agent = Agent(model=optimized)
        response = agent.run("Hello!")

        # Access metrics
        print(f"Saved {optimized.total_tokens_saved} tokens")

        # With custom config
        from headroom import HeadroomConfig, HeadroomMode
        config = HeadroomConfig(default_mode=HeadroomMode.OPTIMIZE)
        optimized = HeadroomAgnoModel(wrapped_model=model, headroom_config=config)

    Attributes:
        wrapped_model: The underlying Agno model
        total_tokens_saved: Running total of tokens saved
        metrics_history: List of OptimizationMetrics from recent calls
    """

    # Required by Model base class - we'll derive from wrapped model
    id: str = field(default="headroom-wrapper")
    name: str | None = field(default=None)
    provider: str | None = field(default=None)

    # HeadroomAgnoModel specific fields
    wrapped_model: Any = field(default=None)
    headroom_config: HeadroomConfig | None = field(default=None)
    headroom_mode: HeadroomMode | None = field(default=None)
    auto_detect_provider: bool = field(default=True)

    # Internal state (not part of dataclass comparison)
    _metrics_history: list[OptimizationMetrics] = field(
        default_factory=list, repr=False, compare=False
    )
    _total_tokens_saved: int = field(default=0, repr=False, compare=False)
    _pipeline: TransformPipeline | None = field(default=None, repr=False, compare=False)
    _headroom_provider: Any = field(default=None, repr=False, compare=False)
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False, compare=False)
    _initialized: bool = field(default=False, repr=False, compare=False)

    def __post_init__(self) -> None:
        """Initialize HeadroomAgnoModel after dataclass construction."""
        _check_agno_available()

        if self.wrapped_model is None:
            raise ValueError("wrapped_model cannot be None")

        # Set id from wrapped model
        if hasattr(self.wrapped_model, "id"):
            self.id = f"headroom:{self.wrapped_model.id}"

        # Set name and provider from wrapped model for compatibility
        if self.name is None and hasattr(self.wrapped_model, "name"):
            self.name = self.wrapped_model.name
        if self.provider is None and hasattr(self.wrapped_model, "provider"):
            self.provider = self.wrapped_model.provider

        # Initialize config
        if self.headroom_config is None:
            self.headroom_config = HeadroomConfig()

        # Handle deprecated mode parameter
        if self.headroom_mode is not None:
            warnings.warn(
                "The 'headroom_mode' parameter is deprecated. Use HeadroomConfig(default_mode=...) instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        self._initialized = True

        # Call parent __post_init__ if it exists
        if hasattr(super(), "__post_init__"):
            super().__post_init__()

    # Forward attribute access to wrapped model for compatibility
    def __getattr__(self, name: str) -> Any:
        """Forward attribute access to wrapped model."""
        # Avoid infinite recursion during initialization
        if name.startswith("_") or not self.__dict__.get("_initialized", False):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
        if name in (
            "wrapped_model",
            "headroom_config",
            "headroom_mode",
            "auto_detect_provider",
            "pipeline",
            "total_tokens_saved",
            "metrics_history",
            "id",
            "name",
            "provider",
        ):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
        return getattr(self.wrapped_model, name)

    @property
    def pipeline(self) -> TransformPipeline:
        """Lazily initialize TransformPipeline (thread-safe)."""
        if self._pipeline is None:
            with self._lock:
                # Double-check after acquiring lock
                if self._pipeline is None:
                    if self.auto_detect_provider:
                        self._headroom_provider = get_headroom_provider(self.wrapped_model)
                        logger.debug(
                            f"Auto-detected provider: {self._headroom_provider.__class__.__name__}"
                        )
                    else:
                        self._headroom_provider = OpenAIProvider()
                    self._pipeline = TransformPipeline(
                        config=self.headroom_config,
                        provider=self._headroom_provider,
                    )
        return self._pipeline
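
    # Note (added for clarity, not in the original source): the None check is
    # repeated inside the lock ("double-checked locking") so the common
    # post-initialization path reads self._pipeline without taking the lock,
    # while racing first callers still construct the pipeline exactly once.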

    @property
    def total_tokens_saved(self) -> int:
        """Total tokens saved across all calls."""
        return self._total_tokens_saved

    @property
    def metrics_history(self) -> list[OptimizationMetrics]:
        """History of optimization metrics."""
        return self._metrics_history.copy()

    def _convert_messages_to_openai(self, messages: list[Any]) -> list[dict[str, Any]]:
        """Convert Agno messages to OpenAI format for Headroom."""
        result = []
        for msg in messages:
            # Handle Agno Message objects
            if hasattr(msg, "role") and hasattr(msg, "content"):
                entry: dict[str, Any] = {
                    "role": msg.role,
                    "content": msg.content if msg.content is not None else "",
                }
                # Handle tool calls
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    entry["tool_calls"] = msg.tool_calls
                # Handle tool call ID for tool responses
                if hasattr(msg, "tool_call_id") and msg.tool_call_id:
                    entry["tool_call_id"] = msg.tool_call_id
                result.append(entry)
            # Handle dict format
            elif isinstance(msg, dict):
                result.append(msg.copy())
            else:
                # Try to extract content
                content = str(msg) if msg is not None else ""
                result.append({"role": "user", "content": content})
        return result

    def _convert_messages_from_openai(self, messages: list[dict[str, Any]]) -> list[Any]:
        """Convert OpenAI format messages back to Agno format.

        Note: Agno typically accepts OpenAI-format dicts directly,
        so we may not need full conversion.
        """
        # Agno models generally accept OpenAI-format messages
        # Return as-is for compatibility
        return messages

    def _optimize_messages(self, messages: list[Any]) -> tuple[list[Any], OptimizationMetrics]:
        """Apply Headroom optimization to messages.

        Thread-safe with fallback on pipeline errors.
        """
        request_id = str(uuid4())

        # Convert to OpenAI format
        openai_messages = self._convert_messages_to_openai(messages)

        # Handle empty messages gracefully
        if not openai_messages:
            metrics = OptimizationMetrics(
                request_id=request_id,
                timestamp=datetime.now(timezone.utc),
                tokens_before=0,
                tokens_after=0,
                tokens_saved=0,
                savings_percent=0,
                transforms_applied=[],
                model=get_model_name_from_agno(self.wrapped_model),
            )
            return openai_messages, metrics

        # Get model name from wrapped model
        model = get_model_name_from_agno(self.wrapped_model)

        # Ensure pipeline is initialized
        _ = self.pipeline

        # Get model context limit
        model_limit = (
            self._headroom_provider.get_context_limit(model) if self._headroom_provider else 128000
        )

        try:
            # Apply Headroom transforms via pipeline
            result = self.pipeline.apply(
                messages=openai_messages,
                model=model,
                model_limit=model_limit,
            )
            optimized = result.messages
            tokens_before = result.tokens_before
            tokens_after = result.tokens_after
            transforms_applied = result.transforms_applied
        except (
            ValueError,
            TypeError,
            AttributeError,
            RuntimeError,
            KeyError,
            IndexError,
            ImportError,
            OSError,
        ) as e:
            # Fallback to original messages on pipeline error
            # Log at warning level (degraded behavior, not critical failure)
            logger.warning(
                f"Headroom optimization failed, using original messages: {type(e).__name__}: {e}"
            )
            optimized = openai_messages
            # Estimate token count for unoptimized messages (rough approximation)
            # Note: This uses ~4 chars/token which is approximate for English text
            tokens_before = sum(len(str(m.get("content", ""))) // 4 for m in openai_messages)
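            # Illustrative arithmetic (added, not in the original source): under
            # this heuristic a 2,000-character message estimates to
            # 2000 // 4 = 500 tokens; a real tokenizer count will differ.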
            tokens_after = tokens_before  # No optimization occurred
            transforms_applied = ["fallback:error"]

        # Create metrics
        tokens_saved = max(0, tokens_before - tokens_after)  # Never negative
        metrics = OptimizationMetrics(
            request_id=request_id,
            timestamp=datetime.now(timezone.utc),
            tokens_before=tokens_before,
            tokens_after=tokens_after,
            tokens_saved=tokens_saved,
            savings_percent=(tokens_saved / tokens_before * 100 if tokens_before > 0 else 0),
            transforms_applied=transforms_applied,
            model=model,
        )

        # Track metrics (thread-safe)
        with self._lock:
            self._metrics_history.append(metrics)
            self._total_tokens_saved += metrics.tokens_saved

            # Keep only last 100 metrics
            if len(self._metrics_history) > 100:
                self._metrics_history = self._metrics_history[-100:]

        # Convert back (Agno accepts OpenAI format)
        optimized_messages = self._convert_messages_from_openai(optimized)

        return optimized_messages, metrics

    def response(self, messages: list[Any], **kwargs: Any) -> Any:  # type: ignore[override]
        """Generate response with Headroom optimization.

        This is the core method that Agno agents call.
        """
        # Optimize messages
        optimized_messages, metrics = self._optimize_messages(messages)

        logger.info(
            f"Headroom optimized: {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Call wrapped model with optimized messages
        return self.wrapped_model.response(optimized_messages, **kwargs)

    def response_stream(self, messages: list[Any], **kwargs: Any) -> Iterator[Any]:  # type: ignore[override]
        """Stream response with Headroom optimization."""
        # Optimize messages
        optimized_messages, metrics = self._optimize_messages(messages)

        logger.info(
            f"Headroom optimized (streaming): {metrics.tokens_before} -> "
            f"{metrics.tokens_after} tokens"
        )

        # Stream from wrapped model
        yield from self.wrapped_model.response_stream(optimized_messages, **kwargs)

    async def aresponse(self, messages: list[Any], **kwargs: Any) -> Any:  # type: ignore[override]
        """Async generate response with Headroom optimization."""
        # Run optimization in executor (CPU-bound)
        loop = asyncio.get_running_loop()
        optimized_messages, metrics = await loop.run_in_executor(
            None, self._optimize_messages, messages
        )

        logger.info(
            f"Headroom optimized (async): {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Call wrapped model's async method
        if hasattr(self.wrapped_model, "aresponse"):
            return await self.wrapped_model.aresponse(optimized_messages, **kwargs)
        else:
            # Fallback to sync in executor (non-blocking)
            return await loop.run_in_executor(
                None, lambda: self.wrapped_model.response(optimized_messages, **kwargs)
            )

    async def aresponse_stream(self, messages: list[Any], **kwargs: Any) -> AsyncIterator[Any]:  # type: ignore[override]
        """Async stream response with Headroom optimization."""
        # Run optimization in executor (CPU-bound)
        loop = asyncio.get_running_loop()
        optimized_messages, metrics = await loop.run_in_executor(
            None, self._optimize_messages, messages
        )

        logger.info(
            f"Headroom optimized (async streaming): {metrics.tokens_before} -> "
            f"{metrics.tokens_after} tokens"
        )

        # Async stream from wrapped model
        if hasattr(self.wrapped_model, "aresponse_stream"):
            async for chunk in self.wrapped_model.aresponse_stream(optimized_messages, **kwargs):
                yield chunk
        else:
            # Fallback: wrap sync streaming in async iterator (non-blocking)
            # Run the entire sync iteration in executor to avoid blocking event loop
            def _sync_stream() -> list[Any]:
                return list(self.wrapped_model.response_stream(optimized_messages, **kwargs))

            chunks = await loop.run_in_executor(None, _sync_stream)
            for chunk in chunks:
                yield chunk

    def get_savings_summary(self) -> dict[str, Any]:
        """Get summary of token savings."""
        if not self._metrics_history:
            return {
                "total_requests": 0,
                "total_tokens_saved": 0,
                "average_savings_percent": 0,
            }

        return {
            "total_requests": len(self._metrics_history),
            "total_tokens_saved": self._total_tokens_saved,
            "average_savings_percent": sum(m.savings_percent for m in self._metrics_history)
            / len(self._metrics_history),
            "total_tokens_before": sum(m.tokens_before for m in self._metrics_history),
            "total_tokens_after": sum(m.tokens_after for m in self._metrics_history),
        }
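
    # Example return shape (illustrative values, not from the original source):
    #   {"total_requests": 12, "total_tokens_saved": 8432,
    #    "average_savings_percent": 31.5,
    #    "total_tokens_before": 26800, "total_tokens_after": 18368}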

    def reset(self) -> None:
        """Reset all tracked metrics (thread-safe).

        Clears the metrics history and resets the total tokens saved counter.
        Useful for starting fresh measurements or between test runs.
        """
        with self._lock:
            self._metrics_history = []
            self._total_tokens_saved = 0

    # =========================================================================
    # Abstract method implementations required by agno.models.base.Model
    # These delegate to the wrapped model after applying Headroom optimization
    # =========================================================================

    def invoke(self, messages: list[Any], **kwargs: Any) -> Any:
        """Invoke the wrapped model with optimized messages.

        This is required by agno.models.base.Model abstract interface.
        """
        # Optimize messages before invoking
        optimized_messages, metrics = self._optimize_messages(messages)

        logger.info(
            f"Headroom optimized (invoke): {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Delegate to wrapped model
        return self.wrapped_model.invoke(optimized_messages, **kwargs)

    async def ainvoke(self, messages: list[Any], **kwargs: Any) -> Any:
        """Async invoke the wrapped model with optimized messages.

        This is required by agno.models.base.Model abstract interface.
        """
        # Run optimization in executor (CPU-bound)
        loop = asyncio.get_running_loop()
        optimized_messages, metrics = await loop.run_in_executor(
            None, self._optimize_messages, messages
        )

        logger.info(
            f"Headroom optimized (ainvoke): {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Delegate to wrapped model
        if hasattr(self.wrapped_model, "ainvoke"):
            return await self.wrapped_model.ainvoke(optimized_messages, **kwargs)
        else:
            # Fallback to sync in executor
            return await loop.run_in_executor(
                None, lambda: self.wrapped_model.invoke(optimized_messages, **kwargs)
            )

    def invoke_stream(self, messages: list[Any], **kwargs: Any) -> Iterator[Any]:
        """Stream invoke the wrapped model with optimized messages.

        This is required by agno.models.base.Model abstract interface.
        """
        # Optimize messages before streaming
        optimized_messages, metrics = self._optimize_messages(messages)

        logger.info(
            f"Headroom optimized (invoke_stream): {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Delegate to wrapped model
        yield from self.wrapped_model.invoke_stream(optimized_messages, **kwargs)

    async def ainvoke_stream(self, messages: list[Any], **kwargs: Any) -> AsyncIterator[Any]:
        """Async stream invoke the wrapped model with optimized messages.

        This is required by agno.models.base.Model abstract interface.
        """
        # Run optimization in executor (CPU-bound)
        loop = asyncio.get_running_loop()
        optimized_messages, metrics = await loop.run_in_executor(
            None, self._optimize_messages, messages
        )

        logger.info(
            f"Headroom optimized (ainvoke_stream): {metrics.tokens_before} -> {metrics.tokens_after} tokens "
            f"({metrics.savings_percent:.1f}% saved)"
        )

        # Delegate to wrapped model
        if hasattr(self.wrapped_model, "ainvoke_stream"):
            async for chunk in self.wrapped_model.ainvoke_stream(optimized_messages, **kwargs):
                yield chunk
        else:
            # Fallback: wrap sync streaming
            def _sync_stream() -> list[Any]:
                return list(self.wrapped_model.invoke_stream(optimized_messages, **kwargs))

            chunks = await loop.run_in_executor(None, _sync_stream)
            for chunk in chunks:
                yield chunk

    def _parse_provider_response(self, response: Any, **kwargs: Any) -> Any:
        """Parse provider response - delegates to wrapped model.

        This is required by agno.models.base.Model abstract interface.
        """
        return self.wrapped_model._parse_provider_response(response, **kwargs)

    def _parse_provider_response_delta(self, response: Any) -> Any:
        """Parse streaming response delta - delegates to wrapped model.

        This is required by agno.models.base.Model abstract interface.
        """
        return self.wrapped_model._parse_provider_response_delta(response)


def optimize_messages(
    messages: list[Any],
    config: HeadroomConfig | None = None,
    mode: HeadroomMode = HeadroomMode.OPTIMIZE,
    model: str = "gpt-4o",
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Standalone function to optimize Agno messages.

    Use this for manual optimization when you need fine-grained control.

    Args:
        messages: List of Agno Message objects or dicts
        config: HeadroomConfig for optimization settings
        mode: HeadroomMode (AUDIT, OPTIMIZE, or SIMULATE)
        model: Model name for token estimation

    Returns:
        Tuple of (optimized_messages, metrics_dict)

    Example:
        from headroom.integrations.agno import optimize_messages

        messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "What is 2+2?"},
        ]

        optimized, metrics = optimize_messages(messages)
        print(f"Saved {metrics['tokens_saved']} tokens")
    """
    _check_agno_available()

    config = config or HeadroomConfig()
    provider = OpenAIProvider()
    pipeline = TransformPipeline(config=config, provider=provider)

    # Convert to OpenAI format
    openai_messages = []
    for msg in messages:
        if hasattr(msg, "role") and hasattr(msg, "content"):
            entry: dict[str, Any] = {"role": msg.role, "content": msg.content or ""}
            if hasattr(msg, "tool_calls") and msg.tool_calls:
                entry["tool_calls"] = msg.tool_calls
            if hasattr(msg, "tool_call_id") and msg.tool_call_id:
                entry["tool_call_id"] = msg.tool_call_id
            openai_messages.append(entry)
        elif isinstance(msg, dict):
            openai_messages.append(msg.copy())
        else:
            openai_messages.append({"role": "user", "content": str(msg)})

    # Get model context limit
    model_limit = provider.get_context_limit(model)

    # Apply transforms
    result = pipeline.apply(
        messages=openai_messages,
        model=model,
        model_limit=model_limit,
    )

    metrics = {
        "tokens_before": result.tokens_before,
        "tokens_after": result.tokens_after,
        "tokens_saved": result.tokens_before - result.tokens_after,
        "savings_percent": (
            (result.tokens_before - result.tokens_after) / result.tokens_before * 100
            if result.tokens_before > 0
            else 0
        ),
        "transforms_applied": result.transforms_applied,
    }

    return result.messages, metrics
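
A minimal end-to-end sketch of the wrapper as documented in the docstrings above (assumes agno and headroom-ai are installed and credentials are configured; the model id and config values are illustrative):

    from agno.agent import Agent
    from agno.models.openai import OpenAIChat
    from headroom import HeadroomConfig, HeadroomMode
    from headroom.integrations.agno import HeadroomAgnoModel

    # Wrap any Agno model; context is optimized before each API call
    optimized = HeadroomAgnoModel(
        wrapped_model=OpenAIChat(id="gpt-4o"),
        headroom_config=HeadroomConfig(default_mode=HeadroomMode.OPTIMIZE),
    )

    agent = Agent(model=optimized)
    agent.run("Summarize our conversation so far.")

    # Aggregate savings across all calls made through the wrapper
    print(optimized.get_savings_summary())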