headroom-ai 0.2.13 (headroom_ai-0.2.13-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/compression/__init__.py
@@ -0,0 +1,42 @@
"""Universal compression with ML-based content detection.

This module provides intelligent, automatic compression that:
1. Detects content type using ML (Magika)
2. Preserves structure (keys, signatures, templates)
3. Compresses content with LLMLingua
4. Enables retrieval via CCR

Quick Start:
    # One-liner for simple use
    from headroom.compression import compress
    result = compress(content)

    # Or with configuration
    from headroom.compression import UniversalCompressor, UniversalCompressorConfig

    config = UniversalCompressorConfig(compression_ratio_target=0.5)
    compressor = UniversalCompressor(config=config)
    result = compressor.compress(content)
"""

from headroom.compression.detector import ContentType, MagikaDetector
from headroom.compression.masks import StructureMask
from headroom.compression.universal import (
    CompressionResult,
    UniversalCompressor,
    UniversalCompressorConfig,
    compress,
)

__all__ = [
    # Simple API
    "compress",
    # Full API
    "UniversalCompressor",
    "UniversalCompressorConfig",
    "CompressionResult",
    # Advanced
    "MagikaDetector",
    "ContentType",
    "StructureMask",
]
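Reviewer note: the Quick Start above is consistent with the module's __all__. A minimal runnable sketch of both paths, using only exported names; the attributes of the returned CompressionResult are defined in headroom/compression/universal.py (a separate file in this wheel), so this sketch deliberately does not guess at field names:

    from headroom.compression import (
        UniversalCompressor,
        UniversalCompressorConfig,
        compress,
    )

    text = open("tool_output.json").read()  # any large string: JSON, logs, code

    # Simple path: module-level compress() with default settings.
    result = compress(text)

    # Configured path: target roughly half the original size.
    config = UniversalCompressorConfig(compression_ratio_target=0.5)
    compressor = UniversalCompressor(config=config)
    result = compressor.compress(text)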
headroom/compression/detector.py
@@ -0,0 +1,424 @@
"""ML-based content type detection using Google's Magika.

Magika is a deep learning model for content type detection that:
- Runs locally (~5ms latency)
- Supports 100+ content types
- Has 99%+ accuracy on supported types
- Requires no configuration

This replaces rule-based detection with learned detection.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from magika import Magika
    from magika.types import MagikaResult

logger = logging.getLogger(__name__)

# Lazy-loaded Magika instance (singleton)
_magika_instance: Magika | None = None


class ContentType(Enum):
    """High-level content categories for compression routing."""

    JSON = "json"
    CODE = "code"
    LOG = "log"
    MARKDOWN = "markdown"
    TEXT = "text"
    UNKNOWN = "unknown"


@dataclass
class DetectionResult:
    """Result of ML-based content detection."""

    content_type: ContentType
    confidence: float  # 0.0 to 1.0
    raw_label: str  # Original Magika label
    language: str | None = None  # For code: python, javascript, etc.
    metadata: dict = field(default_factory=dict)


# Map Magika labels to our content types
# This is the ONLY place where we map labels - no hardcoding elsewhere
_CODE_LABELS = frozenset(
    {
        "python",
        "javascript",
        "typescript",
        "go",
        "rust",
        "java",
        "c",
        "cpp",
        "csharp",
        "ruby",
        "php",
        "swift",
        "kotlin",
        "scala",
        "shell",
        "bash",
        "powershell",
        "sql",
        "r",
        "perl",
        "lua",
        "haskell",
        "elixir",
        "erlang",
        "clojure",
        "ocaml",
        "fsharp",
        "dart",
        "julia",
        "zig",
        "nim",
        "crystal",
        "v",
        "solidity",
        "move",
        "cairo",
        "vyper",
    }
)

_STRUCTURED_LABELS = frozenset(
    {
        "json",
        "jsonl",
        "yaml",
        "toml",
        "xml",
        "html",
        "csv",
        "tsv",
        "ini",
        "properties",
    }
)

_LOG_LABELS = frozenset(
    {
        "log",
        "syslog",
    }
)

_MARKDOWN_LABELS = frozenset(
    {
        "markdown",
        "rst",
        "asciidoc",
        "org",
    }
)


def _get_magika() -> Magika:
    """Get or create the singleton Magika instance.

    Lazy-loads on first use to avoid import cost if not needed.
    """
    global _magika_instance
    if _magika_instance is None:
        try:
            from magika import Magika

            _magika_instance = Magika()
            logger.debug("Magika model loaded successfully")
        except ImportError as e:
            raise ImportError(
                "Magika is required for ML-based content detection. "
                "Install with: pip install magika"
            ) from e
    return _magika_instance


def _magika_available() -> bool:
    """Check if Magika is available without loading it."""
    try:
        import magika  # noqa: F401

        return True
    except ImportError:
        return False


class MagikaDetector:
    """ML-based content type detector using Google's Magika.

    This detector uses a deep learning model to identify content types
    without relying on file extensions or brittle regex patterns.

    Example:
        detector = MagikaDetector()
        result = detector.detect('def hello(): print("hi")')
        # result.content_type == ContentType.CODE
        # result.language == "python"
    """

    def __init__(self, min_confidence: float = 0.5):
        """Initialize the detector.

        Args:
            min_confidence: Minimum confidence threshold. Below this,
                returns ContentType.UNKNOWN.
        """
        self.min_confidence = min_confidence
        self._magika: Magika | None = None

    def _ensure_magika(self) -> Magika:
        """Ensure Magika is loaded."""
        if self._magika is None:
            self._magika = _get_magika()
        return self._magika

    def detect(self, content: str) -> DetectionResult:
        """Detect content type using ML.

        Args:
            content: The content to analyze.

        Returns:
            DetectionResult with type, confidence, and metadata.

        Example:
            >>> detector = MagikaDetector()
            >>> result = detector.detect('{"users": [{"id": 1}]}')
            >>> result.content_type
            ContentType.JSON
        """
        if not content or not content.strip():
            return DetectionResult(
                content_type=ContentType.UNKNOWN,
                confidence=0.0,
                raw_label="empty",
            )

        # Get Magika prediction
        magika = self._ensure_magika()
        result: MagikaResult = magika.identify_bytes(content.encode("utf-8"))

        raw_label = result.output.ct_label
        confidence = result.output.score

        # Map to our content type
        content_type, language = self._map_label(raw_label)

        # Apply confidence threshold
        if confidence < self.min_confidence:
            content_type = ContentType.UNKNOWN

        return DetectionResult(
            content_type=content_type,
            confidence=confidence,
            raw_label=raw_label,
            language=language,
            metadata={
                "magika_group": result.output.group,
                "magika_mime": result.output.mime_type,
            },
        )

    def detect_batch(self, contents: list[str]) -> list[DetectionResult]:
        """Detect content types for multiple contents.

        More efficient than calling detect() in a loop.

        Args:
            contents: List of content strings to analyze.

        Returns:
            List of DetectionResults in same order as input.
        """
        if not contents:
            return []

        magika = self._ensure_magika()
        results = []

        # Convert to bytes for Magika
        byte_contents = [c.encode("utf-8") for c in contents]

        # Batch detection
        magika_results = magika.identify_bytes_batch(byte_contents)

        for content, magika_result in zip(contents, magika_results):
            if not content or not content.strip():
                results.append(
                    DetectionResult(
                        content_type=ContentType.UNKNOWN,
                        confidence=0.0,
                        raw_label="empty",
                    )
                )
                continue

            raw_label = magika_result.output.ct_label
            confidence = magika_result.output.score
            content_type, language = self._map_label(raw_label)

            if confidence < self.min_confidence:
                content_type = ContentType.UNKNOWN

            results.append(
                DetectionResult(
                    content_type=content_type,
                    confidence=confidence,
                    raw_label=raw_label,
                    language=language,
                    metadata={
                        "magika_group": magika_result.output.group,
                        "magika_mime": magika_result.output.mime_type,
                    },
                )
            )

        return results

    def _map_label(self, label: str) -> tuple[ContentType, str | None]:
        """Map Magika label to our ContentType.

        Args:
            label: Raw Magika label (e.g., "python", "json").

        Returns:
            Tuple of (ContentType, optional language).
        """
        label_lower = label.lower()

        # Check code languages
        if label_lower in _CODE_LABELS:
            return ContentType.CODE, label_lower

        # Check structured data
        if label_lower in _STRUCTURED_LABELS:
            # JSON gets its own type for specialized handling
            if label_lower in ("json", "jsonl"):
                return ContentType.JSON, None
            # Other structured data treated as JSON-like
            return ContentType.JSON, None

        # Check logs
        if label_lower in _LOG_LABELS:
            return ContentType.LOG, None

        # Check markdown/docs
        if label_lower in _MARKDOWN_LABELS:
            return ContentType.MARKDOWN, None

        # Text types
        if label_lower in ("txt", "text", "ascii", "utf8", "empty"):
            return ContentType.TEXT, None

        # Default: treat as text
        return ContentType.TEXT, None

    @staticmethod
    def is_available() -> bool:
        """Check if Magika is available."""
        return _magika_available()


class FallbackDetector:
    """Simple fallback detector when Magika is not available.

    Uses basic heuristics - not as accurate but requires no dependencies.
    """

    def __init__(self, min_confidence: float = 0.5):
        """Initialize the fallback detector."""
        self.min_confidence = min_confidence

    def detect(self, content: str) -> DetectionResult:
        """Detect content type using simple heuristics.

        Args:
            content: The content to analyze.

        Returns:
            DetectionResult with type and confidence.
        """
        if not content or not content.strip():
            return DetectionResult(
                content_type=ContentType.UNKNOWN,
                confidence=0.0,
                raw_label="empty",
            )

        stripped = content.strip()

        # JSON detection (simple but effective)
        if stripped.startswith(("{", "[")):
            try:
                import json

                json.loads(stripped)
                return DetectionResult(
                    content_type=ContentType.JSON,
                    confidence=1.0,
                    raw_label="json",
                )
            except (json.JSONDecodeError, ValueError):
                pass

        # Code detection (look for common patterns)
        code_indicators = [
            "def ",
            "class ",
            "function ",
            "import ",
            "const ",
            "let ",
            "var ",
            "func ",
            "fn ",
            "pub ",
            "package ",
        ]
        if any(indicator in content for indicator in code_indicators):
            return DetectionResult(
                content_type=ContentType.CODE,
                confidence=0.7,
                raw_label="code",
            )

        # Log detection
        log_indicators = ["ERROR", "WARN", "INFO", "DEBUG", "FATAL"]
        if any(indicator in content for indicator in log_indicators):
            return DetectionResult(
                content_type=ContentType.LOG,
                confidence=0.6,
                raw_label="log",
            )

        # Default to text
        return DetectionResult(
            content_type=ContentType.TEXT,
            confidence=0.5,
            raw_label="text",
        )


def get_detector(prefer_magika: bool = True) -> MagikaDetector | FallbackDetector:
    """Get the best available detector.

    Args:
        prefer_magika: If True, use Magika if available.

    Returns:
        MagikaDetector if available and preferred, else FallbackDetector.
    """
    if prefer_magika and MagikaDetector.is_available():
        return MagikaDetector()
    return FallbackDetector()
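Reviewer note: a short sketch exercising the public surface of this file, using only names defined in detector.py above. With magika not installed, get_detector() silently falls back to the heuristic FallbackDetector, so the expected labels in the comments are the fallback's; the Magika path depends on the model's prediction:

    from headroom.compression.detector import ContentType, get_detector

    detector = get_detector(prefer_magika=True)

    samples = [
        '{"id": 1, "name": "ada"}',  # fallback: JSON, confidence 1.0
        "def f():\n    return 1",    # fallback: CODE, confidence 0.7
        "2024-01-01 ERROR boom",     # fallback: LOG, confidence 0.6
    ]
    for blob in samples:
        r = detector.detect(blob)
        print(r.content_type, round(r.confidence, 2), r.raw_label, r.language)

    # Only MagikaDetector populates r.language (e.g. "python" for code);
    # FallbackDetector always leaves it None.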
headroom/compression/handlers/__init__.py
@@ -0,0 +1,22 @@
"""Structure handlers for different content types.

Each handler knows how to extract structural information from a specific
content type and create a StructureMask marking what should be preserved.

Handlers don't compress - they only identify structure. The actual
compression is done by LLMLingua on the non-structural parts.
"""

from headroom.compression.handlers.base import (
    HandlerResult,
    StructureHandler,
)
from headroom.compression.handlers.code_handler import CodeStructureHandler
from headroom.compression.handlers.json_handler import JSONStructureHandler

__all__ = [
    "StructureHandler",
    "HandlerResult",
    "JSONStructureHandler",
    "CodeStructureHandler",
]
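Reviewer note: a hedged sketch of how these exports presumably compose with the detector. The routing dict is illustrative, not code from the package, and the two handlers are constructed bare on the assumption that, like BaseStructureHandler, their constructors have defaults; their actual signatures live in code_handler.py and json_handler.py (not shown here):

    from headroom.compression.detector import ContentType, get_detector
    from headroom.compression.handlers import (
        CodeStructureHandler,
        JSONStructureHandler,
    )

    # Illustrative routing table: one structure handler per detected type.
    handlers = {
        ContentType.JSON: JSONStructureHandler(),
        ContentType.CODE: CodeStructureHandler(),
    }

    def mask_for(content: str):
        detected = get_detector().detect(content)
        handler = handlers.get(detected.content_type)
        if handler is None or not handler.can_handle(content):
            return None  # caller falls back to unmasked compression
        return handler.get_mask(content)  # HandlerResult: mask + metadata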
headroom/compression/handlers/base.py
@@ -0,0 +1,219 @@
"""Base class and protocol for structure handlers.

Structure handlers extract structural information from content and create
masks identifying what should be preserved during compression.

The handler protocol is simple:
1. get_mask(content) -> StructureMask
2. can_handle(content) -> bool (optional)

Handlers are content-type specific but domain-agnostic. A JSONStructureHandler
preserves JSON keys whether it's user data, search results, or config files.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Protocol, runtime_checkable

from headroom.compression.masks import StructureMask


@dataclass
class HandlerResult:
    """Result from a structure handler.

    Contains the mask plus metadata about what was detected.
    """

    mask: StructureMask
    handler_name: str
    confidence: float = 1.0  # How confident the handler is in its detection
    metadata: dict = field(default_factory=dict)

    @property
    def preservation_ratio(self) -> float:
        """Fraction of content marked for preservation."""
        return self.mask.preservation_ratio


@runtime_checkable
class StructureHandler(Protocol):
    """Protocol for structure handlers.

    Any class implementing get_mask() can be used as a handler.
    """

    @property
    def name(self) -> str:
        """Handler name for logging and metadata."""
        ...

    def get_mask(
        self,
        content: str,
        tokens: list[str] | None = None,
        **kwargs: Any,
    ) -> HandlerResult:
        """Extract structure mask from content.

        Args:
            content: The content to analyze.
            tokens: Pre-tokenized content (optional). If not provided,
                handler should tokenize internally.
            **kwargs: Handler-specific options.

        Returns:
            HandlerResult with mask and metadata.
        """
        ...

    def can_handle(self, content: str) -> bool:
        """Check if this handler can process the content.

        Default implementation returns True. Override for handlers
        that need to verify content format before processing.

        Args:
            content: The content to check.

        Returns:
            True if handler can process this content.
        """
        ...


class BaseStructureHandler(ABC):
    """Base implementation for structure handlers.

    Provides common functionality and enforces the handler interface.
    Subclasses must implement _extract_mask().
    """

    def __init__(self, name: str | None = None):
        """Initialize the handler.

        Args:
            name: Optional handler name. Defaults to class name.
        """
        self._name = name or self.__class__.__name__

    @property
    def name(self) -> str:
        """Handler name."""
        return self._name

    def get_mask(
        self,
        content: str,
        tokens: list[str] | None = None,
        **kwargs: Any,
    ) -> HandlerResult:
        """Extract structure mask from content.

        This is the main entry point. It handles common logic like
        empty content and delegates to _extract_mask() for the
        content-specific logic.

        Args:
            content: The content to analyze.
            tokens: Pre-tokenized content (optional).
            **kwargs: Handler-specific options.

        Returns:
            HandlerResult with mask and metadata.
        """
        # Handle empty content
        if not content or not content.strip():
            tokens = tokens or []
            return HandlerResult(
                mask=StructureMask.empty(tokens),
                handler_name=self.name,
                confidence=0.0,
                metadata={"empty": True},
            )

        # Tokenize if not provided
        if tokens is None:
            tokens = self._tokenize(content)

        # Delegate to subclass
        return self._extract_mask(content, tokens, **kwargs)

    def can_handle(self, content: str) -> bool:
        """Check if this handler can process the content.

        Default implementation returns True. Override for handlers
        that need to verify content format.

        Args:
            content: The content to check.

        Returns:
            True if handler can process this content.
        """
        return True

    @abstractmethod
    def _extract_mask(
        self,
        content: str,
        tokens: list[str],
        **kwargs: Any,
    ) -> HandlerResult:
        """Extract structure mask from content.

        Subclasses implement this to provide content-specific logic.

        Args:
            content: The content to analyze (non-empty, stripped).
            tokens: Tokenized content.
            **kwargs: Handler-specific options.

        Returns:
            HandlerResult with mask and metadata.
        """
        ...

    def _tokenize(self, content: str) -> list[str]:
        """Default tokenization - character-level.

        Subclasses may override for more sophisticated tokenization.
        For mask purposes, character-level is often sufficient and
        aligns well with LLMLingua's token-level compression.

        Args:
            content: Content to tokenize.

        Returns:
            List of tokens (characters by default).
        """
        # Simple character-level tokenization
        # This aligns well with structure detection (we mark ranges)
        return list(content)


class NoOpHandler(BaseStructureHandler):
    """Handler that marks everything as compressible.

    Used as a fallback when no structure is detected.
    """

    def __init__(self) -> None:
        """Initialize the no-op handler."""
        super().__init__(name="noop")

    def _extract_mask(
        self,
        content: str,
        tokens: list[str],
        **kwargs: Any,
    ) -> HandlerResult:
        """Return mask with everything compressible."""
        return HandlerResult(
            mask=StructureMask.empty(tokens),
            handler_name=self.name,
            confidence=1.0,
            metadata={"reason": "no structure detected"},
        )
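Reviewer note: NoOpHandler doubles as a template for third-party handlers. Below is a sketch of a custom subclass that uses only names defined in this file; StructureMask.empty is the one StructureMask constructor base.py itself relies on, so a real handler would build a non-empty mask, but masks.py is a separate file in this diff:

    from typing import Any

    class LogLevelHandler(BaseStructureHandler):
        """Sketch: flags log-like content but preserves nothing yet."""

        def __init__(self) -> None:
            super().__init__(name="log-level")

        def _extract_mask(
            self, content: str, tokens: list[str], **kwargs: Any
        ) -> HandlerResult:
            looks_like_log = any(
                level in content for level in ("ERROR", "WARN", "INFO", "DEBUG")
            )
            return HandlerResult(
                mask=StructureMask.empty(tokens),  # nothing preserved in this sketch
                handler_name=self.name,
                confidence=0.6 if looks_like_log else 0.3,
                metadata={"looks_like_log": looks_like_log},
            )

    # StructureHandler is a runtime_checkable Protocol, so conformance can be
    # verified structurally rather than by inheritance:
    assert isinstance(LogLevelHandler(), StructureHandler)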