headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/transforms/llmlingua_compressor.py
@@ -0,0 +1,638 @@
+ """LLMLingua-2 compressor for ML-based prompt compression.
+
+ This module provides integration with LLMLingua-2, a BERT-based token classifier
+ trained via GPT-4 distillation. It achieves superior compression (up to 20x)
+ while maintaining high fidelity on tool outputs and structured content.
+
+ Key Features:
+     - Token-level classification (keep/remove) using fine-tuned BERT
+     - 3-6x faster than LLMLingua-1 with better results
+     - Especially effective on tool outputs, code, and structured data
+     - Reversible compression via CCR integration
+
+ Reference:
+     LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression
+     https://arxiv.org/abs/2403.12968
+
+ Installation:
+     pip install headroom-ai[llmlingua]
+
+ Usage:
+     >>> from headroom.transforms import LLMLinguaCompressor
+     >>> compressor = LLMLinguaCompressor()
+     >>> result = compressor.compress(long_tool_output)
+     >>> print(result.compressed)  # Significantly reduced output
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import threading
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ from ..config import TransformResult
+ from ..tokenizer import Tokenizer
+ from .base import Transform
+
+ logger = logging.getLogger(__name__)
+
+ # Lazy import for optional dependency
+ _llmlingua_available: bool | None = None
+ _llmlingua_instance: Any = None
+ _llmlingua_lock = threading.Lock()  # Thread safety for model access
+
+
+ def _check_llmlingua_available() -> bool:
+     """Check if the llmlingua package is available."""
+     global _llmlingua_available
+     if _llmlingua_available is None:
+         try:
+             import llmlingua  # noqa: F401
+
+             _llmlingua_available = True
+         except ImportError:
+             _llmlingua_available = False
+     return _llmlingua_available
+
+
+ def _get_llmlingua_compressor(model_name: str, device: str) -> Any:
+     """Get or create the LLMLingua compressor instance.
+
+     Uses lazy initialization and caches the instance to avoid repeated model loading.
+     Thread-safe: uses a lock to prevent race conditions during model initialization.
+
+     Args:
+         model_name: HuggingFace model name for the compressor.
+         device: Device to run the model on ('cuda', 'cpu', or 'auto').
+
+     Returns:
+         PromptCompressor instance from llmlingua.
+
+     Raises:
+         ImportError: If llmlingua is not installed.
+         RuntimeError: If model loading fails.
+     """
+     global _llmlingua_instance
+
+     if not _check_llmlingua_available():
+         raise ImportError(
+             "llmlingua is not installed. Install with: pip install headroom-ai[llmlingua]\n"
+             "Note: This requires ~2GB of disk space and ~1GB RAM for the model."
+         )
+
+     with _llmlingua_lock:
+         # Double-check after acquiring the lock
+         if _llmlingua_instance is None or _llmlingua_instance._model_name != model_name:
+             try:
+                 from llmlingua import PromptCompressor
+
+                 logger.info(
+                     "Loading LLMLingua-2 model: %s on device: %s "
+                     "(this may take 10-30s on first run)",
+                     model_name,
+                     device,
+                 )
+                 _llmlingua_instance = PromptCompressor(
+                     model_name=model_name,
+                     device_map=device,
+                     use_llmlingua2=True,  # Use LLMLingua-2 (BERT classifier)
+                 )
+                 # Store the model name for later comparison
+                 _llmlingua_instance._model_name = model_name
+                 logger.info("LLMLingua-2 model loaded successfully")
+
+             except Exception as e:
+                 error_msg = str(e).lower()
+                 if "out of memory" in error_msg or "oom" in error_msg:
+                     raise RuntimeError(
+                         f"Out of memory loading LLMLingua model. Try:\n"
+                         f"  1. Use device='cpu' instead of 'cuda'\n"
+                         f"  2. Close other GPU applications\n"
+                         f"  3. Use a smaller model\n"
+                         f"Original error: {e}"
+                     ) from e
+                 elif "not found" in error_msg or "404" in error_msg:
+                     raise RuntimeError(
+                         f"Model '{model_name}' not found on HuggingFace. Try:\n"
+                         f"  1. Check the model name is correct\n"
+                         f"  2. Use the default: 'microsoft/llmlingua-2-xlm-roberta-large-meetingbank'\n"
+                         f"Original error: {e}"
+                     ) from e
+                 else:
+                     raise RuntimeError(
+                         f"Failed to load LLMLingua model: {e}\n"
+                         f"Ensure you have sufficient disk space and memory."
+                     ) from e
+
+     return _llmlingua_instance
+
+
+ def unload_llmlingua_model() -> bool:
+     """Unload the LLMLingua model to free memory.
+
+     Use this when you're done with compression and want to reclaim GPU/CPU memory.
+     The model will be reloaded automatically on the next compression call.
+
+     Returns:
+         True if a model was unloaded, False if no model was loaded.
+
+     Example:
+         >>> from headroom.transforms import LLMLinguaCompressor, unload_llmlingua_model
+         >>> compressor = LLMLinguaCompressor()
+         >>> result = compressor.compress(content)  # Model loaded here
+         >>> # ... do other work ...
+         >>> unload_llmlingua_model()  # Free ~1GB of memory
+     """
+     global _llmlingua_instance
+
+     with _llmlingua_lock:
+         if _llmlingua_instance is not None:
+             model_name = getattr(_llmlingua_instance, "_model_name", "unknown")
+             logger.info("Unloading LLMLingua model: %s", model_name)
+
+             # Clear the instance
+             _llmlingua_instance = None
+
+             # Attempt to free GPU memory if torch is available
+             try:
+                 import torch
+
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+                     logger.debug("Cleared CUDA cache")
+             except ImportError:
+                 pass
+
+             return True
+
+     return False
+
+
+ def is_llmlingua_model_loaded() -> bool:
+     """Check if an LLMLingua model is currently loaded.
+
+     Returns:
+         True if a model is loaded in memory, False otherwise.
+     """
+     return _llmlingua_instance is not None
+
+
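Together, these two helpers let memory-sensitive callers poll and reclaim the model between batches; a minimal sketch, importing from this module's path directly:

from headroom.transforms.llmlingua_compressor import (
    is_llmlingua_model_loaded,
    unload_llmlingua_model,
)

# Free ~1GB between compression batches; the model is reloaded
# lazily on the next compress() call.
if is_llmlingua_model_loaded():
    unload_llmlingua_model()
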
+ @dataclass
+ class LLMLinguaConfig:
+     """Configuration for LLMLingua-2 compression.
+
+     Attributes:
+         model_name: HuggingFace model for the compressor. Default is the
+             LLMLingua-2 xlm-roberta-large model fine-tuned for compression.
+         device: Device to run on ('cuda', 'cpu', 'auto'). Auto will use CUDA if available.
+         target_compression_rate: Target compression ratio (e.g., 0.3 = keep 30% of tokens).
+         force_tokens: Tokens to always preserve (e.g., important keywords).
+         drop_consecutive: Whether to drop consecutive punctuation/whitespace.
+         min_tokens_for_compression: Minimum token count to trigger compression.
+             Content below this threshold is passed through unchanged.
+         enable_ccr: Whether to store originals in CCR for retrieval.
+         ccr_ttl: TTL for CCR entries in seconds.
+
+     GOTCHA: Lower target_compression_rate = more aggressive compression.
+     A rate of 0.2 means keeping only 20% of tokens.
+     """
+
+     # Model configuration
+     model_name: str = "microsoft/llmlingua-2-xlm-roberta-large-meetingbank"
+     device: str = "auto"
+
+     # Compression parameters
+     target_compression_rate: float = 0.3
+     force_tokens: list[str] = field(default_factory=list)
+     drop_consecutive: bool = True
+
+     # Thresholds
+     min_tokens_for_compression: int = 100
+
+     # CCR integration
+     enable_ccr: bool = True
+     ccr_ttl: int = 300  # 5 minutes
+
+     # Content type specific settings
+     code_compression_rate: float = 0.4  # More conservative for code
+     json_compression_rate: float = 0.35  # Slightly conservative for JSON
+     text_compression_rate: float = 0.25  # More aggressive for plain text
+
+
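For illustration, a config sketch that compresses prose aggressively but stays gentle on code; the specific rate values and force tokens here are arbitrary choices, not package defaults:

from headroom.transforms.llmlingua_compressor import (
    LLMLinguaCompressor,
    LLMLinguaConfig,
)

config = LLMLinguaConfig(
    text_compression_rate=0.2,            # keep ~20% of plain-text tokens
    code_compression_rate=0.5,            # keep ~50% of code tokens
    min_tokens_for_compression=200,       # pass through anything shorter
    force_tokens=["ERROR", "Traceback"],  # never drop these markers
    device="cpu",                         # skip GPU even if available
)
compressor = LLMLinguaCompressor(config)
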
+ @dataclass
+ class LLMLinguaResult:
+     """Result of LLMLingua-2 compression.
+
+     Attributes:
+         compressed: Compressed content.
+         original: Original content before compression.
+         original_tokens: Token count of original content.
+         compressed_tokens: Token count after compression.
+         compression_ratio: Actual compression ratio achieved.
+         cache_key: CCR cache key if stored.
+         model_used: Model that performed the compression.
+         tokens_saved: Number of tokens saved (derived property).
+         savings_percentage: Percentage of tokens saved (derived property).
+     """
+
+     compressed: str
+     original: str
+     original_tokens: int
+     compressed_tokens: int
+     compression_ratio: float
+     cache_key: str | None = None
+     model_used: str | None = None
+
+     @property
+     def tokens_saved(self) -> int:
+         """Number of tokens saved by compression."""
+         return max(0, self.original_tokens - self.compressed_tokens)
+
+     @property
+     def savings_percentage(self) -> float:
+         """Percentage of tokens saved."""
+         if self.original_tokens == 0:
+             return 0.0
+         return (self.tokens_saved / self.original_tokens) * 100
+
+
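A quick worked example of the two derived properties, with made-up counts:

from headroom.transforms.llmlingua_compressor import LLMLinguaResult

result = LLMLinguaResult(
    compressed="<compressed text>",
    original="<original text>",
    original_tokens=1000,
    compressed_tokens=300,
    compression_ratio=0.3,
)
assert result.tokens_saved == 700         # max(0, 1000 - 300)
assert result.savings_percentage == 70.0  # 700 / 1000 * 100
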
+ class LLMLinguaCompressor(Transform):
+     """LLMLingua-2 based prompt compressor.
+
+     Uses a BERT-based token classifier trained via GPT-4 distillation to
+     identify and remove non-essential tokens while preserving semantic meaning.
+
+     Key advantages over statistical compression:
+         - Learned token importance from LLM feedback
+         - Better handling of context-dependent importance
+         - More aggressive compression with less information loss
+         - Especially effective on structured outputs (JSON, code, logs)
+
+     Example:
+         >>> compressor = LLMLinguaCompressor()
+         >>> result = compressor.compress(long_tool_output)
+         >>> print(f"Saved {result.tokens_saved} tokens ({result.savings_percentage:.1f}%)")
+
+         >>> # Use as a Transform in a pipeline
+         >>> from headroom.transforms import TransformPipeline
+         >>> pipeline = TransformPipeline([LLMLinguaCompressor()])
+         >>> result = pipeline.apply(messages, tokenizer)
+     """
+
+     name: str = "llmlingua_compressor"
+
+     def __init__(self, config: LLMLinguaConfig | None = None):
+         """Initialize the LLMLingua compressor.
+
+         Args:
+             config: Compression configuration. If None, uses defaults.
+
+         Note:
+             The underlying model is loaded lazily on first use to avoid
+             startup overhead when the compressor isn't used.
+         """
+         self.config = config or LLMLinguaConfig()
+         self._compressor: Any = None  # Lazy loaded
+
+     def compress(
+         self,
+         content: str,
+         context: str = "",
+         content_type: str | None = None,
+     ) -> LLMLinguaResult:
+         """Compress content using LLMLingua-2.
+
+         Args:
+             content: Content to compress.
+             context: Optional context for relevance-aware compression.
+             content_type: Type of content ('code', 'json', 'text').
+                 If None, auto-detected.
+
+         Returns:
+             LLMLinguaResult with compressed content and metadata.
+
+         Note:
+             If llmlingua is not installed or compression fails, the content
+             is returned unchanged (compression_ratio=1.0) rather than raising.
+         """
+         # Check availability
+         if not _check_llmlingua_available():
+             logger.warning(
+                 "LLMLingua not available. Install with: pip install headroom-ai[llmlingua]"
+             )
+             return LLMLinguaResult(
+                 compressed=content,
+                 original=content,
+                 original_tokens=len(content.split()),  # Rough estimate
+                 compressed_tokens=len(content.split()),
+                 compression_ratio=1.0,
+             )
+
+         # Estimate token count (rough)
+         estimated_tokens = len(content.split())
+
+         # Skip compression for small content
+         if estimated_tokens < self.config.min_tokens_for_compression:
+             return LLMLinguaResult(
+                 compressed=content,
+                 original=content,
+                 original_tokens=estimated_tokens,
+                 compressed_tokens=estimated_tokens,
+                 compression_ratio=1.0,
+             )
+
+         # Get the compression rate based on content type
+         compression_rate = self._get_compression_rate(content, content_type)
+
+         # Get or initialize the compressor
+         device = self._resolve_device()
+         compressor = _get_llmlingua_compressor(self.config.model_name, device)
+
+         # Prepare force tokens
+         force_tokens = list(self.config.force_tokens)
+
+         # Add context words as force tokens if provided
+         if context:
+             context_words = [w for w in context.split() if len(w) > 3]
+             force_tokens.extend(context_words[:10])  # Limit to avoid overhead
+
+         # Perform compression
+         try:
+             result = compressor.compress_prompt(
+                 context=[content],  # LLMLingua expects a list of context strings
+                 rate=compression_rate,
+                 force_tokens=force_tokens,
+                 drop_consecutive=self.config.drop_consecutive,
+             )
+
+             compressed = result.get("compressed_prompt", content)
+             original_tokens = result.get("origin_tokens", estimated_tokens)
+             compressed_tokens = result.get("compressed_tokens", len(compressed.split()))
+
+         except Exception as e:
+             logger.warning("LLMLingua compression failed: %s", e)
+             return LLMLinguaResult(
+                 compressed=content,
+                 original=content,
+                 original_tokens=estimated_tokens,
+                 compressed_tokens=estimated_tokens,
+                 compression_ratio=1.0,
+             )
+
+         # Calculate the actual ratio
+         ratio = compressed_tokens / max(original_tokens, 1)
+
+         # Store in CCR if enabled
+         cache_key = None
+         if self.config.enable_ccr and ratio < 0.8:
+             cache_key = self._store_in_ccr(content, compressed, original_tokens)
+             if cache_key:
+                 # Use standard CCR marker format for CCRToolInjector detection
+                 compressed += (
+                     f"\n[{original_tokens} items compressed to {compressed_tokens}. "
+                     f"Retrieve more: hash={cache_key}]"
+                 )
+
+         return LLMLinguaResult(
+             compressed=compressed,
+             original=content,
+             original_tokens=original_tokens,
+             compressed_tokens=compressed_tokens,
+             compression_ratio=ratio,
+             cache_key=cache_key,
+             model_used=self.config.model_name,
+         )
+
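Because compress() degrades to a passthrough instead of raising, call sites can stay unconditional; a sketch assuming long_tool_output is some large JSON string:

from headroom.transforms.llmlingua_compressor import LLMLinguaCompressor

compressor = LLMLinguaCompressor()
result = compressor.compress(long_tool_output, content_type="json")

if result.compression_ratio < 1.0:
    print(f"Saved {result.tokens_saved} tokens "
          f"({result.savings_percentage:.1f}%)")
if result.cache_key:
    # The original was stored in CCR and can be retrieved via this key.
    print(f"CCR key: {result.cache_key}")
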
+     def apply(
+         self,
+         messages: list[dict[str, Any]],
+         tokenizer: Tokenizer,
+         **kwargs: Any,
+     ) -> TransformResult:
+         """Apply LLMLingua compression to messages.
+
+         This method implements the Transform interface for use in pipelines.
+         It compresses tool outputs and long assistant messages.
+
+         Args:
+             messages: List of message dicts to transform.
+             tokenizer: Tokenizer for accurate token counting.
+             **kwargs: Additional arguments (e.g., 'context' for relevance).
+
+         Returns:
+             TransformResult with compressed messages and metadata.
+         """
+         tokens_before = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
+         context = kwargs.get("context", "")
+
+         transformed_messages = []
+         transforms_applied = []
+         warnings: list[str] = []
+
+         for message in messages:
+             role = message.get("role", "")
+             content = message.get("content", "")
+
+             # Skip non-string content (multimodal messages with images)
+             if not isinstance(content, str):
+                 transformed_messages.append(message)
+                 continue
+
+             # Compress tool results (highest-value compression)
+             if role == "tool" and content:
+                 result = self.compress(content, context=context, content_type="json")
+                 if result.compression_ratio < 0.9:
+                     transformed_messages.append({**message, "content": result.compressed})
+                     transforms_applied.append(f"llmlingua:tool:{result.compression_ratio:.2f}")
+                 else:
+                     transformed_messages.append(message)
+
+             # Compress long assistant messages (tool outputs are often embedded)
+             elif role == "assistant" and len(content) > 500:
+                 result = self.compress(content, context=context)
+                 if result.compression_ratio < 0.9:
+                     transformed_messages.append({**message, "content": result.compressed})
+                     transforms_applied.append(
+                         f"llmlingua:assistant:{result.compression_ratio:.2f}"
+                     )
+                 else:
+                     transformed_messages.append(message)
+
+             # Pass through other messages
+             else:
+                 transformed_messages.append(message)
+
+         tokens_after = sum(
+             tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
+         )
+
+         # Add a warning if llmlingua is not available
+         if not _check_llmlingua_available():
+             warnings.append(
+                 "LLMLingua not installed. Install with: pip install headroom-ai[llmlingua]"
+             )
+
+         return TransformResult(
+             messages=transformed_messages,
+             tokens_before=tokens_before,
+             tokens_after=tokens_after,
+             transforms_applied=transforms_applied if transforms_applied else ["llmlingua:noop"],
+             warnings=warnings,
+         )
+
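A sketch of apply() on a typical message list; constructing the tokenizer is elided here, since any headroom Tokenizer with count_text() works:

from headroom.transforms.llmlingua_compressor import LLMLinguaCompressor

messages = [
    {"role": "user", "content": "Summarize the deploy logs."},
    {"role": "tool", "content": huge_json_tool_output},  # large string
]
result = LLMLinguaCompressor().apply(messages, tokenizer)
print(result.tokens_before, "->", result.tokens_after)
print(result.transforms_applied)  # e.g. ["llmlingua:tool:0.31"] or ["llmlingua:noop"]
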
+     def should_apply(
+         self,
+         messages: list[dict[str, Any]],
+         tokenizer: Tokenizer,
+         **kwargs: Any,
+     ) -> bool:
+         """Check if LLMLingua compression should be applied.
+
+         Returns True if:
+             - LLMLingua is available, AND
+             - the total token count meets the minimum threshold
+
+         Args:
+             messages: Messages to check.
+             tokenizer: Tokenizer for counting.
+             **kwargs: Additional arguments.
+
+         Returns:
+             True if compression should be applied.
+         """
+         if not _check_llmlingua_available():
+             return False
+
+         total_tokens = sum(tokenizer.count_text(str(m.get("content", ""))) for m in messages)
+         return total_tokens >= self.config.min_tokens_for_compression
+
+     def _get_compression_rate(
+         self,
+         content: str,
+         content_type: str | None,
+     ) -> float:
+         """Get the appropriate compression rate based on content type.
+
+         Args:
+             content: Content to analyze.
+             content_type: Explicit content type, or None for auto-detection.
+
+         Returns:
+             Target compression rate for this content.
+         """
+         if content_type == "code":
+             return self.config.code_compression_rate
+         elif content_type == "json":
+             return self.config.json_compression_rate
+         elif content_type == "text":
+             return self.config.text_compression_rate
+
+         # Auto-detect the content type
+         if self._looks_like_json(content):
+             return self.config.json_compression_rate
+         elif self._looks_like_code(content):
+             return self.config.code_compression_rate
+         else:
+             return self.config.text_compression_rate
+
+     def _looks_like_json(self, content: str) -> bool:
+         """Check if content appears to be JSON."""
+         stripped = content.strip()
+         return (stripped.startswith("{") and stripped.endswith("}")) or (
+             stripped.startswith("[") and stripped.endswith("]")
+         )
+
+     def _looks_like_code(self, content: str) -> bool:
+         """Check if content appears to be code."""
+         code_indicators = [
+             "def ",
+             "class ",
+             "function ",
+             "import ",
+             "from ",
+             "const ",
+             "let ",
+             "var ",
+             "public ",
+             "private ",
+             "async ",
+             "await ",
+             "return ",
+             "if (",
+             "for (",
+             "while (",
+         ]
+         return any(indicator in content for indicator in code_indicators)
+
+     def _resolve_device(self) -> str:
+         """Resolve the 'auto' device to an actual device."""
+         if self.config.device != "auto":
+             return self.config.device
+
+         try:
+             import torch
+
+             if torch.cuda.is_available():
+                 return "cuda"
+             elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                 return "mps"
+         except ImportError:
+             pass
+
+         return "cpu"
+
+     def _store_in_ccr(
+         self,
+         original: str,
+         compressed: str,
+         original_tokens: int,
+     ) -> str | None:
+         """Store original content in CCR for later retrieval.
+
+         Args:
+             original: Original content before compression.
+             compressed: Compressed content.
+             original_tokens: Token count of the original.
+
+         Returns:
+             Cache key if stored successfully, None otherwise.
+         """
+         try:
+             from ..cache.compression_store import get_compression_store
+
+             store = get_compression_store()
+             return store.store(
+                 original,
+                 compressed,
+                 original_tokens=original_tokens,
+                 compressed_tokens=len(compressed.split()),
+                 compression_strategy="llmlingua2",
+             )
+         except ImportError:
+             return None
+         except Exception as e:
+             logger.debug("CCR storage failed: %s", e)
+             return None
+
+
+ def compress_with_llmlingua(
+     content: str,
+     compression_rate: float = 0.3,
+     context: str = "",
+     model_name: str | None = None,
+ ) -> str:
+     """Convenience function for one-off compression.
+
+     Args:
+         content: Content to compress.
+         compression_rate: Target compression rate (0.0-1.0), applied to all
+             content types.
+         context: Optional context for relevance-aware compression.
+         model_name: Optional model name override.
+
+     Returns:
+         Compressed content string.
+
+     Example:
+         >>> compressed = compress_with_llmlingua(long_output, compression_rate=0.2)
+     """
+     # compress() picks its rate from the per-content-type fields via
+     # _get_compression_rate() and never reads target_compression_rate,
+     # so the requested rate must be set on all of them to take effect.
+     config = LLMLinguaConfig(
+         target_compression_rate=compression_rate,
+         code_compression_rate=compression_rate,
+         json_compression_rate=compression_rate,
+         text_compression_rate=compression_rate,
+     )
+     if model_name:
+         config.model_name = model_name
+
+     compressor = LLMLinguaCompressor(config)
+     result = compressor.compress(content, context=context)
+     return result.compressed