headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/transforms/llmlingua_compressor.py
@@ -0,0 +1,638 @@
"""LLMLingua-2 compressor for ML-based prompt compression.

This module provides integration with LLMLingua-2, a BERT-based token classifier
trained via GPT-4 distillation. It achieves superior compression (up to 20x)
while maintaining high fidelity on tool outputs and structured content.

Key Features:
- Token-level classification (keep/remove) using fine-tuned BERT
- 3-6x faster than LLMLingua-1 with better results
- Especially effective on tool outputs, code, and structured data
- Reversible compression via CCR integration

Reference:
    LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression
    https://arxiv.org/abs/2403.12968

Installation:
    pip install headroom-ai[llmlingua]

Usage:
    >>> from headroom.transforms import LLMLinguaCompressor
    >>> compressor = LLMLinguaCompressor()
    >>> result = compressor.compress(long_tool_output)
    >>> print(result.compressed)  # Significantly reduced output
"""

from __future__ import annotations

import logging
import threading
from dataclasses import dataclass, field
from typing import Any

from ..config import TransformResult
from ..tokenizer import Tokenizer
from .base import Transform

logger = logging.getLogger(__name__)

# Lazy import for optional dependency
_llmlingua_available: bool | None = None
_llmlingua_instance: Any = None
_llmlingua_lock = threading.Lock()  # Thread safety for model access


def _check_llmlingua_available() -> bool:
    """Check if llmlingua package is available."""
    global _llmlingua_available
    if _llmlingua_available is None:
        try:
            import llmlingua  # noqa: F401

            _llmlingua_available = True
        except ImportError:
            _llmlingua_available = False
    return _llmlingua_available


def _get_llmlingua_compressor(model_name: str, device: str) -> Any:
    """Get or create the LLMLingua compressor instance.

    Uses lazy initialization and caches the instance to avoid repeated model loading.
    Thread-safe: uses lock to prevent race conditions during model initialization.

    Args:
        model_name: HuggingFace model name for the compressor.
        device: Device to run the model on ('cuda', 'cpu', or 'auto').

    Returns:
        PromptCompressor instance from llmlingua.

    Raises:
        ImportError: If llmlingua is not installed.
        RuntimeError: If model loading fails.
    """
    global _llmlingua_instance

    if not _check_llmlingua_available():
        raise ImportError(
            "llmlingua is not installed. Install with: pip install headroom-ai[llmlingua]\n"
            "Note: This requires ~2GB of disk space and ~1GB RAM for the model."
        )

    with _llmlingua_lock:
        # Double-check after acquiring lock
        if _llmlingua_instance is None or _llmlingua_instance._model_name != model_name:
            try:
                from llmlingua import PromptCompressor

                logger.info(
                    "Loading LLMLingua-2 model: %s on device: %s "
                    "(this may take 10-30s on first run)",
                    model_name,
                    device,
                )
                _llmlingua_instance = PromptCompressor(
                    model_name=model_name,
                    device_map=device,
                    use_llmlingua2=True,  # Use LLMLingua-2 (BERT classifier)
                )
                # Store model name for later comparison
                _llmlingua_instance._model_name = model_name
                logger.info("LLMLingua-2 model loaded successfully")

            except Exception as e:
                error_msg = str(e).lower()
                if "out of memory" in error_msg or "oom" in error_msg:
                    raise RuntimeError(
                        f"Out of memory loading LLMLingua model. Try:\n"
                        f"  1. Use device='cpu' instead of 'cuda'\n"
                        f"  2. Close other GPU applications\n"
                        f"  3. Use a smaller model\n"
                        f"Original error: {e}"
                    ) from e
                elif "not found" in error_msg or "404" in error_msg:
                    raise RuntimeError(
                        f"Model '{model_name}' not found on HuggingFace. Try:\n"
                        f"  1. Check the model name is correct\n"
                        f"  2. Use default: 'microsoft/llmlingua-2-xlm-roberta-large-meetingbank'\n"
                        f"Original error: {e}"
                    ) from e
                else:
                    raise RuntimeError(
                        f"Failed to load LLMLingua model: {e}\n"
                        f"Ensure you have sufficient disk space and memory."
                    ) from e

    return _llmlingua_instance


def unload_llmlingua_model() -> bool:
    """Unload the LLMLingua model to free memory.

    Use this when you're done with compression and want to reclaim GPU/CPU memory.
    The model will be reloaded automatically on the next compression call.

    Returns:
        True if a model was unloaded, False if no model was loaded.

    Example:
        >>> from headroom.transforms import LLMLinguaCompressor, unload_llmlingua_model
        >>> compressor = LLMLinguaCompressor()
        >>> result = compressor.compress(content)  # Model loaded here
        >>> # ... do other work ...
        >>> unload_llmlingua_model()  # Free ~1GB of memory
    """
    global _llmlingua_instance

    with _llmlingua_lock:
        if _llmlingua_instance is not None:
            model_name = getattr(_llmlingua_instance, "_model_name", "unknown")
            logger.info("Unloading LLMLingua model: %s", model_name)

            # Clear the instance
            _llmlingua_instance = None

            # Attempt to free GPU memory if torch is available
            try:
                import torch

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    logger.debug("Cleared CUDA cache")
            except ImportError:
                pass

            return True

    return False


def is_llmlingua_model_loaded() -> bool:
    """Check if an LLMLingua model is currently loaded.

    Returns:
        True if a model is loaded in memory, False otherwise.
    """
    return _llmlingua_instance is not None


@dataclass
class LLMLinguaConfig:
    """Configuration for LLMLingua-2 compression.

    Attributes:
        model_name: HuggingFace model for the compressor. Default is the
            LLMLingua-2 xlm-roberta-large model fine-tuned for compression.
        device: Device to run on ('cuda', 'cpu', 'auto'). Auto will use CUDA if available.
        target_compression_rate: Target compression ratio (e.g., 0.3 = keep 30% of tokens).
        force_tokens: Tokens to always preserve (e.g., important keywords).
        drop_consecutive: Whether to drop consecutive punctuation/whitespace.
        min_tokens_for_compression: Minimum token count to trigger compression.
            Content below this threshold is passed through unchanged.
        enable_ccr: Whether to store originals in CCR for retrieval.
        ccr_ttl: TTL for CCR entries in seconds.

    GOTCHA: Lower target_compression_rate = more aggressive compression.
        A rate of 0.2 means keeping only 20% of tokens.
    """

    # Model configuration
    model_name: str = "microsoft/llmlingua-2-xlm-roberta-large-meetingbank"
    device: str = "auto"

    # Compression parameters
    target_compression_rate: float = 0.3
    force_tokens: list[str] = field(default_factory=list)
    drop_consecutive: bool = True

    # Thresholds
    min_tokens_for_compression: int = 100

    # CCR integration
    enable_ccr: bool = True
    ccr_ttl: int = 300  # 5 minutes

    # Content type specific settings
    code_compression_rate: float = 0.4  # More conservative for code
    json_compression_rate: float = 0.35  # Slightly conservative for JSON
    text_compression_rate: float = 0.25  # More aggressive for plain text


@dataclass
class LLMLinguaResult:
    """Result of LLMLingua-2 compression.

    Attributes:
        compressed: Compressed content.
        original: Original content before compression.
        original_tokens: Token count of original content.
        compressed_tokens: Token count after compression.
        compression_ratio: Actual compression ratio achieved.
        cache_key: CCR cache key if stored.
        model_used: Model that performed the compression.
        tokens_saved: Number of tokens saved.
    """

    compressed: str
    original: str
    original_tokens: int
    compressed_tokens: int
    compression_ratio: float
    cache_key: str | None = None
    model_used: str | None = None

    @property
    def tokens_saved(self) -> int:
        """Number of tokens saved by compression."""
        return max(0, self.original_tokens - self.compressed_tokens)

    @property
    def savings_percentage(self) -> float:
        """Percentage of tokens saved."""
        if self.original_tokens == 0:
            return 0.0
        return (self.tokens_saved / self.original_tokens) * 100


class LLMLinguaCompressor(Transform):
    """LLMLingua-2 based prompt compressor.

    Uses a BERT-based token classifier trained via GPT-4 distillation to
    identify and remove non-essential tokens while preserving semantic meaning.

    Key advantages over statistical compression:
    - Learned token importance from LLM feedback
    - Better handling of context-dependent importance
    - More aggressive compression with less information loss
    - Especially effective on structured outputs (JSON, code, logs)

    Example:
        >>> compressor = LLMLinguaCompressor()
        >>> result = compressor.compress(long_tool_output)
        >>> print(f"Saved {result.tokens_saved} tokens ({result.savings_percentage:.1f}%)")

        >>> # Use as a Transform in pipeline
        >>> from headroom.transforms import TransformPipeline
        >>> pipeline = TransformPipeline([LLMLinguaCompressor()])
        >>> result = pipeline.apply(messages, tokenizer)
    """

    name: str = "llmlingua_compressor"

    def __init__(self, config: LLMLinguaConfig | None = None):
        """Initialize LLMLingua compressor.

        Args:
            config: Compression configuration. If None, uses defaults.

        Note:
            The underlying model is loaded lazily on first use to avoid
            startup overhead when the compressor isn't used.
        """
        self.config = config or LLMLinguaConfig()
        self._compressor: Any = None  # Lazy loaded

    def compress(
        self,
        content: str,
        context: str = "",
        content_type: str | None = None,
    ) -> LLMLinguaResult:
        """Compress content using LLMLingua-2.

        Args:
            content: Content to compress.
            context: Optional context for relevance-aware compression.
            content_type: Type of content ('code', 'json', 'text').
                If None, auto-detected.

        Returns:
            LLMLinguaResult with compressed content and metadata.

        Raises:
            ImportError: If llmlingua is not installed.
        """
        # Check availability
        if not _check_llmlingua_available():
            logger.warning(
                "LLMLingua not available. Install with: pip install headroom-ai[llmlingua]"
            )
            return LLMLinguaResult(
                compressed=content,
                original=content,
                original_tokens=len(content.split()),  # Rough estimate
                compressed_tokens=len(content.split()),
                compression_ratio=1.0,
            )

        # Estimate token count (rough)
        estimated_tokens = len(content.split())

        # Skip compression for small content
        if estimated_tokens < self.config.min_tokens_for_compression:
            return LLMLinguaResult(
                compressed=content,
                original=content,
                original_tokens=estimated_tokens,
                compressed_tokens=estimated_tokens,
                compression_ratio=1.0,
            )

        # Get compression rate based on content type
        compression_rate = self._get_compression_rate(content, content_type)

        # Get or initialize compressor
        device = self._resolve_device()
        compressor = _get_llmlingua_compressor(self.config.model_name, device)

        # Prepare force tokens
        force_tokens = list(self.config.force_tokens)

        # Add context words as force tokens if provided
        if context:
            context_words = [w for w in context.split() if len(w) > 3]
            force_tokens.extend(context_words[:10])  # Limit to avoid overhead

        # Perform compression
        try:
            result = compressor.compress_prompt(
                context=[content],  # LLMLingua expects a list of context strings
                rate=compression_rate,
                force_tokens=force_tokens if force_tokens else [],
                drop_consecutive=self.config.drop_consecutive,
            )

            compressed = result.get("compressed_prompt", content)
            original_tokens = result.get("origin_tokens", estimated_tokens)
            compressed_tokens = result.get("compressed_tokens", len(compressed.split()))

        except Exception as e:
            logger.warning("LLMLingua compression failed: %s", e)
            return LLMLinguaResult(
                compressed=content,
                original=content,
                original_tokens=estimated_tokens,
                compressed_tokens=estimated_tokens,
                compression_ratio=1.0,
            )

        # Calculate actual ratio
        ratio = compressed_tokens / max(original_tokens, 1)

        # Store in CCR if enabled
        cache_key = None
        if self.config.enable_ccr and ratio < 0.8:
            cache_key = self._store_in_ccr(content, compressed, original_tokens)
            if cache_key:
                # Use standard CCR marker format for CCRToolInjector detection
                compressed += f"\n[{original_tokens} items compressed to {compressed_tokens}. Retrieve more: hash={cache_key}]"

        return LLMLinguaResult(
            compressed=compressed,
            original=content,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=ratio,
            cache_key=cache_key,
            model_used=self.config.model_name,
        )

    def apply(
        self,
        messages: list[dict[str, Any]],
        tokenizer: Tokenizer,
        **kwargs: Any,
    ) -> TransformResult:
        """Apply LLMLingua compression to messages.

        This method implements the Transform interface for use in pipelines.
        It compresses tool outputs and long assistant/user messages.

        Args:
            messages: List of message dicts to transform.
            tokenizer: Tokenizer for accurate token counting.
            **kwargs: Additional arguments (e.g., 'context' for relevance).

        Returns:
            TransformResult with compressed messages and metadata.
        """
        tokens_before = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
        context = kwargs.get("context", "")

        transformed_messages = []
        transforms_applied = []
        warnings: list[str] = []

        for message in messages:
            role = message.get("role", "")
            content = message.get("content", "")

            # Skip non-string content (multimodal messages with images)
            if not isinstance(content, str):
                transformed_messages.append(message)
                continue

            # Compress tool results (highest value compression)
            if role == "tool" and content:
                result = self.compress(content, context=context, content_type="json")
                if result.compression_ratio < 0.9:
                    transformed_messages.append({**message, "content": result.compressed})
                    transforms_applied.append(f"llmlingua:tool:{result.compression_ratio:.2f}")
                else:
                    transformed_messages.append(message)

            # Compress long assistant messages (tool outputs often embedded)
            elif role == "assistant" and len(content) > 500:
                result = self.compress(content, context=context)
                if result.compression_ratio < 0.9:
                    transformed_messages.append({**message, "content": result.compressed})
                    transforms_applied.append(f"llmlingua:assistant:{result.compression_ratio:.2f}")
                else:
                    transformed_messages.append(message)

            # Pass through other messages
            else:
                transformed_messages.append(message)

        tokens_after = sum(
            tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
        )

        # Add warning if llmlingua not available
        if not _check_llmlingua_available():
            warnings.append(
                "LLMLingua not installed. Install with: pip install headroom-ai[llmlingua]"
            )

        return TransformResult(
            messages=transformed_messages,
            tokens_before=tokens_before,
            tokens_after=tokens_after,
            transforms_applied=transforms_applied if transforms_applied else ["llmlingua:noop"],
            warnings=warnings,
        )

    def should_apply(
        self,
        messages: list[dict[str, Any]],
        tokenizer: Tokenizer,
        **kwargs: Any,
    ) -> bool:
        """Check if LLMLingua compression should be applied.

        Returns True if:
        - LLMLingua is available, AND
        - Total token count exceeds minimum threshold

        Args:
            messages: Messages to check.
            tokenizer: Tokenizer for counting.
            **kwargs: Additional arguments.

        Returns:
            True if compression should be applied.
        """
        if not _check_llmlingua_available():
            return False

        total_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
        return total_tokens >= self.config.min_tokens_for_compression

    def _get_compression_rate(
        self,
        content: str,
        content_type: str | None,
    ) -> float:
        """Get appropriate compression rate based on content type.

        Args:
            content: Content to analyze.
            content_type: Explicit content type or None for auto-detection.

        Returns:
            Target compression rate for this content.
        """
        if content_type == "code":
            return self.config.code_compression_rate
        elif content_type == "json":
            return self.config.json_compression_rate
        elif content_type == "text":
            return self.config.text_compression_rate

        # Auto-detect content type
        if self._looks_like_json(content):
            return self.config.json_compression_rate
        elif self._looks_like_code(content):
            return self.config.code_compression_rate
        else:
            return self.config.text_compression_rate

    def _looks_like_json(self, content: str) -> bool:
        """Check if content appears to be JSON."""
        stripped = content.strip()
        return (stripped.startswith("{") and stripped.endswith("}")) or (
            stripped.startswith("[") and stripped.endswith("]")
        )

    def _looks_like_code(self, content: str) -> bool:
        """Check if content appears to be code."""
        code_indicators = [
            "def ",
            "class ",
            "function ",
            "import ",
            "from ",
            "const ",
            "let ",
            "var ",
            "public ",
            "private ",
            "async ",
            "await ",
            "return ",
            "if (",
            "for (",
            "while (",
        ]
        return any(indicator in content for indicator in code_indicators)

    def _resolve_device(self) -> str:
        """Resolve 'auto' device to actual device."""
        if self.config.device != "auto":
            return self.config.device

        try:
            import torch

            if torch.cuda.is_available():
                return "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                return "mps"
        except ImportError:
            pass

        return "cpu"

    def _store_in_ccr(
        self,
        original: str,
        compressed: str,
        original_tokens: int,
    ) -> str | None:
        """Store original content in CCR for later retrieval.

        Args:
            original: Original content before compression.
            compressed: Compressed content.
            original_tokens: Token count of original.

        Returns:
            Cache key if stored successfully, None otherwise.
        """
        try:
            from ..cache.compression_store import get_compression_store

            store = get_compression_store()
            return store.store(
                original,
                compressed,
                original_tokens=original_tokens,
                compressed_tokens=len(compressed.split()),
                compression_strategy="llmlingua2",
            )
        except ImportError:
            return None
        except Exception as e:
            logger.debug("CCR storage failed: %s", e)
            return None


def compress_with_llmlingua(
    content: str,
    compression_rate: float = 0.3,
    context: str = "",
    model_name: str | None = None,
) -> str:
    """Convenience function for one-off compression.

    Args:
        content: Content to compress.
        compression_rate: Target compression rate (0.0-1.0).
        context: Optional context for relevance-aware compression.
        model_name: Optional model name override.

    Returns:
        Compressed content string.

    Example:
        >>> compressed = compress_with_llmlingua(long_output, compression_rate=0.2)
    """
    config = LLMLinguaConfig(target_compression_rate=compression_rate)
    if model_name:
        config.model_name = model_name

    compressor = LLMLinguaCompressor(config)
    result = compressor.compress(content, context=context)
    return result.compressed