headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/compression/__init__.py
@@ -0,0 +1,42 @@
+ """Universal compression with ML-based content detection.
+
+ This module provides intelligent, automatic compression that:
+ 1. Detects content type using ML (Magika)
+ 2. Preserves structure (keys, signatures, templates)
+ 3. Compresses content with LLMLingua
+ 4. Enables retrieval via CCR
+
+ Quick Start:
+     # One-liner for simple use
+     from headroom.compression import compress
+     result = compress(content)
+
+     # Or with configuration
+     from headroom.compression import UniversalCompressor, UniversalCompressorConfig
+
+     config = UniversalCompressorConfig(compression_ratio_target=0.5)
+     compressor = UniversalCompressor(config=config)
+     result = compressor.compress(content)
+ """
+
+ from headroom.compression.detector import ContentType, MagikaDetector
+ from headroom.compression.masks import StructureMask
+ from headroom.compression.universal import (
+     CompressionResult,
+     UniversalCompressor,
+     UniversalCompressorConfig,
+     compress,
+ )
+
+ __all__ = [
+     # Simple API
+     "compress",
+     # Full API
+     "UniversalCompressor",
+     "UniversalCompressorConfig",
+     "CompressionResult",
+     # Advanced
+     "MagikaDetector",
+     "ContentType",
+     "StructureMask",
+ ]
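
A minimal sketch of how the two entry points above compose, assuming only what the module docstring shows (the fields of the returned CompressionResult live in universal.py, which is not part of this hunk, so none are accessed here):

    from headroom.compression import (
        UniversalCompressor,
        UniversalCompressorConfig,
        compress,
    )

    tool_output = '{"users": [{"id": 1, "name": "Ada"}, {"id": 2, "name": "Grace"}]}'

    # One-liner: library-chosen defaults
    result = compress(tool_output)

    # Configured run, mirroring the docstring's example
    config = UniversalCompressorConfig(compression_ratio_target=0.5)
    compressor = UniversalCompressor(config=config)
    result = compressor.compress(tool_output)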
headroom/compression/detector.py
@@ -0,0 +1,424 @@
+ """ML-based content type detection using Google's Magika.
+
+ Magika is a deep learning model for content type detection that:
+ - Runs locally (~5ms latency)
+ - Supports 100+ content types
+ - Has 99%+ accuracy on supported types
+ - Requires no configuration
+
+ This replaces rule-based detection with learned detection.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from magika import Magika
+     from magika.types import MagikaResult
+
+ logger = logging.getLogger(__name__)
+
+ # Lazy-loaded Magika instance (singleton)
+ _magika_instance: Magika | None = None
+
+
+ class ContentType(Enum):
+     """High-level content categories for compression routing."""
+
+     JSON = "json"
+     CODE = "code"
+     LOG = "log"
+     MARKDOWN = "markdown"
+     TEXT = "text"
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class DetectionResult:
+     """Result of ML-based content detection."""
+
+     content_type: ContentType
+     confidence: float  # 0.0 to 1.0
+     raw_label: str  # Original Magika label
+     language: str | None = None  # For code: python, javascript, etc.
+     metadata: dict = field(default_factory=dict)
+
+
+ # Map Magika labels to our content types
+ # This is the ONLY place where we map labels - no hardcoding elsewhere
+ _CODE_LABELS = frozenset(
+     {
+         "python",
+         "javascript",
+         "typescript",
+         "go",
+         "rust",
+         "java",
+         "c",
+         "cpp",
+         "csharp",
+         "ruby",
+         "php",
+         "swift",
+         "kotlin",
+         "scala",
+         "shell",
+         "bash",
+         "powershell",
+         "sql",
+         "r",
+         "perl",
+         "lua",
+         "haskell",
+         "elixir",
+         "erlang",
+         "clojure",
+         "ocaml",
+         "fsharp",
+         "dart",
+         "julia",
+         "zig",
+         "nim",
+         "crystal",
+         "v",
+         "solidity",
+         "move",
+         "cairo",
+         "vyper",
+     }
+ )
+
+ _STRUCTURED_LABELS = frozenset(
+     {
+         "json",
+         "jsonl",
+         "yaml",
+         "toml",
+         "xml",
+         "html",
+         "csv",
+         "tsv",
+         "ini",
+         "properties",
+     }
+ )
+
+ _LOG_LABELS = frozenset(
+     {
+         "log",
+         "syslog",
+     }
+ )
+
+ _MARKDOWN_LABELS = frozenset(
+     {
+         "markdown",
+         "rst",
+         "asciidoc",
+         "org",
+     }
+ )
+
+
+ def _get_magika() -> Magika:
+     """Get or create the singleton Magika instance.
+
+     Lazy-loads on first use to avoid import cost if not needed.
+     """
+     global _magika_instance
+     if _magika_instance is None:
+         try:
+             from magika import Magika
+
+             _magika_instance = Magika()
+             logger.debug("Magika model loaded successfully")
+         except ImportError as e:
+             raise ImportError(
+                 "Magika is required for ML-based content detection. "
+                 "Install with: pip install magika"
+             ) from e
+     return _magika_instance
+
+
+ def _magika_available() -> bool:
+     """Check if Magika is available without loading it."""
+     try:
+         import magika  # noqa: F401
+
+         return True
+     except ImportError:
+         return False
+
+
+ class MagikaDetector:
+     """ML-based content type detector using Google's Magika.
+
+     This detector uses a deep learning model to identify content types
+     without relying on file extensions or brittle regex patterns.
+
+     Example:
+         detector = MagikaDetector()
+         result = detector.detect('def hello(): print("hi")')
+         # result.content_type == ContentType.CODE
+         # result.language == "python"
+     """
+
+     def __init__(self, min_confidence: float = 0.5):
+         """Initialize the detector.
+
+         Args:
+             min_confidence: Minimum confidence threshold. Below this,
+                 returns ContentType.UNKNOWN.
+         """
+         self.min_confidence = min_confidence
+         self._magika: Magika | None = None
+
+     def _ensure_magika(self) -> Magika:
+         """Ensure Magika is loaded."""
+         if self._magika is None:
+             self._magika = _get_magika()
+         return self._magika
+
+     def detect(self, content: str) -> DetectionResult:
+         """Detect content type using ML.
+
+         Args:
+             content: The content to analyze.
+
+         Returns:
+             DetectionResult with type, confidence, and metadata.
+
+         Example:
+             >>> detector = MagikaDetector()
+             >>> result = detector.detect('{"users": [{"id": 1}]}')
+             >>> result.content_type
+             ContentType.JSON
+         """
+         if not content or not content.strip():
+             return DetectionResult(
+                 content_type=ContentType.UNKNOWN,
+                 confidence=0.0,
+                 raw_label="empty",
+             )
+
+         # Get Magika prediction
+         magika = self._ensure_magika()
+         result: MagikaResult = magika.identify_bytes(content.encode("utf-8"))
+
+         raw_label = result.output.ct_label
+         confidence = result.output.score
+
+         # Map to our content type
+         content_type, language = self._map_label(raw_label)
+
+         # Apply confidence threshold
+         if confidence < self.min_confidence:
+             content_type = ContentType.UNKNOWN
+
+         return DetectionResult(
+             content_type=content_type,
+             confidence=confidence,
+             raw_label=raw_label,
+             language=language,
+             metadata={
+                 "magika_group": result.output.group,
+                 "magika_mime": result.output.mime_type,
+             },
+         )
+
+     def detect_batch(self, contents: list[str]) -> list[DetectionResult]:
+         """Detect content types for multiple contents.
+
+         More efficient than calling detect() in a loop.
+
+         Args:
+             contents: List of content strings to analyze.
+
+         Returns:
+             List of DetectionResults in same order as input.
+         """
+         if not contents:
+             return []
+
+         magika = self._ensure_magika()
+         results = []
+
+         # Convert to bytes for Magika
+         byte_contents = [c.encode("utf-8") for c in contents]
+
+         # Batch detection
+         magika_results = magika.identify_bytes_batch(byte_contents)
+
+         for content, magika_result in zip(contents, magika_results):
+             if not content or not content.strip():
+                 results.append(
+                     DetectionResult(
+                         content_type=ContentType.UNKNOWN,
+                         confidence=0.0,
+                         raw_label="empty",
+                     )
+                 )
+                 continue
+
+             raw_label = magika_result.output.ct_label
+             confidence = magika_result.output.score
+             content_type, language = self._map_label(raw_label)
+
+             if confidence < self.min_confidence:
+                 content_type = ContentType.UNKNOWN
+
+             results.append(
+                 DetectionResult(
+                     content_type=content_type,
+                     confidence=confidence,
+                     raw_label=raw_label,
+                     language=language,
+                     metadata={
+                         "magika_group": magika_result.output.group,
+                         "magika_mime": magika_result.output.mime_type,
+                     },
+                 )
+             )
+
+         return results
+
+     def _map_label(self, label: str) -> tuple[ContentType, str | None]:
+         """Map Magika label to our ContentType.
+
+         Args:
+             label: Raw Magika label (e.g., "python", "json").
+
+         Returns:
+             Tuple of (ContentType, optional language).
+         """
+         label_lower = label.lower()
+
+         # Check code languages
+         if label_lower in _CODE_LABELS:
+             return ContentType.CODE, label_lower
+
+         # Check structured data
+         if label_lower in _STRUCTURED_LABELS:
+             # JSON gets its own type for specialized handling
+             if label_lower in ("json", "jsonl"):
+                 return ContentType.JSON, None
+             # Other structured data treated as JSON-like
+             return ContentType.JSON, None
+
+         # Check logs
+         if label_lower in _LOG_LABELS:
+             return ContentType.LOG, None
+
+         # Check markdown/docs
+         if label_lower in _MARKDOWN_LABELS:
+             return ContentType.MARKDOWN, None
+
+         # Text types
+         if label_lower in ("txt", "text", "ascii", "utf8", "empty"):
+             return ContentType.TEXT, None
+
+         # Default: treat as text
+         return ContentType.TEXT, None
+
+     @staticmethod
+     def is_available() -> bool:
+         """Check if Magika is available."""
+         return _magika_available()
+
+
+ class FallbackDetector:
+     """Simple fallback detector when Magika is not available.
+
+     Uses basic heuristics - not as accurate but requires no dependencies.
+     """
+
+     def __init__(self, min_confidence: float = 0.5):
+         """Initialize the fallback detector."""
+         self.min_confidence = min_confidence
+
+     def detect(self, content: str) -> DetectionResult:
+         """Detect content type using simple heuristics.
+
+         Args:
+             content: The content to analyze.
+
+         Returns:
+             DetectionResult with type and confidence.
+         """
+         if not content or not content.strip():
+             return DetectionResult(
+                 content_type=ContentType.UNKNOWN,
+                 confidence=0.0,
+                 raw_label="empty",
+             )
+
+         stripped = content.strip()
+
+         # JSON detection (simple but effective)
+         if stripped.startswith(("{", "[")):
+             try:
+                 import json
+
+                 json.loads(stripped)
+                 return DetectionResult(
+                     content_type=ContentType.JSON,
+                     confidence=1.0,
+                     raw_label="json",
+                 )
+             except (json.JSONDecodeError, ValueError):
+                 pass
+
+         # Code detection (look for common patterns)
+         code_indicators = [
+             "def ",
+             "class ",
+             "function ",
+             "import ",
+             "const ",
+             "let ",
+             "var ",
+             "func ",
+             "fn ",
+             "pub ",
+             "package ",
+         ]
+         if any(indicator in content for indicator in code_indicators):
+             return DetectionResult(
+                 content_type=ContentType.CODE,
+                 confidence=0.7,
+                 raw_label="code",
+             )
+
+         # Log detection
+         log_indicators = ["ERROR", "WARN", "INFO", "DEBUG", "FATAL"]
+         if any(indicator in content for indicator in log_indicators):
+             return DetectionResult(
+                 content_type=ContentType.LOG,
+                 confidence=0.6,
+                 raw_label="log",
+             )
+
+         # Default to text
+         return DetectionResult(
+             content_type=ContentType.TEXT,
+             confidence=0.5,
+             raw_label="text",
+         )
+
+
+ def get_detector(prefer_magika: bool = True) -> MagikaDetector | FallbackDetector:
+     """Get the best available detector.
+
+     Args:
+         prefer_magika: If True, use Magika if available.
+
+     Returns:
+         MagikaDetector if available and preferred, else FallbackDetector.
+     """
+     if prefer_magika and MagikaDetector.is_available():
+         return MagikaDetector()
+     return FallbackDetector()
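
A minimal sketch of the selection-plus-detection flow defined above; everything used here appears in this hunk, and get_detector() degrades to the heuristic FallbackDetector when the optional magika dependency is missing:

    from headroom.compression.detector import ContentType, get_detector

    detector = get_detector(prefer_magika=True)
    result = detector.detect('def hello():\n    print("hi")')

    if result.content_type is ContentType.CODE:
        # With Magika installed, language carries the specific label
        # ("python"); FallbackDetector's heuristics leave it None.
        print(result.language, result.confidence)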
headroom/compression/handlers/__init__.py
@@ -0,0 +1,22 @@
+ """Structure handlers for different content types.
+
+ Each handler knows how to extract structural information from a specific
+ content type and create a StructureMask marking what should be preserved.
+
+ Handlers don't compress - they only identify structure. The actual
+ compression is done by LLMLingua on the non-structural parts.
+ """
+
+ from headroom.compression.handlers.base import (
+     HandlerResult,
+     StructureHandler,
+ )
+ from headroom.compression.handlers.code_handler import CodeStructureHandler
+ from headroom.compression.handlers.json_handler import JSONStructureHandler
+
+ __all__ = [
+     "StructureHandler",
+     "HandlerResult",
+     "JSONStructureHandler",
+     "CodeStructureHandler",
+ ]
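
Since StructureHandler is declared @runtime_checkable in base.py (next hunk), conformance is structural rather than inheritance-based. A quick sketch, importing NoOpHandler straight from base.py because it is not re-exported here:

    from headroom.compression.handlers import StructureHandler
    from headroom.compression.handlers.base import NoOpHandler

    # Passes because NoOpHandler exposes name, get_mask(), and can_handle(),
    # not because it subclasses the Protocol.
    assert isinstance(NoOpHandler(), StructureHandler)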
headroom/compression/handlers/base.py
@@ -0,0 +1,219 @@
+ """Base class and protocol for structure handlers.
+
+ Structure handlers extract structural information from content and create
+ masks identifying what should be preserved during compression.
+
+ The handler protocol is simple:
+ 1. get_mask(content) -> StructureMask
+ 2. can_handle(content) -> bool (optional)
+
+ Handlers are content-type specific but domain-agnostic. A JSONStructureHandler
+ preserves JSON keys whether it's user data, search results, or config files.
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass, field
+ from typing import Any, Protocol, runtime_checkable
+
+ from headroom.compression.masks import StructureMask
+
+
+ @dataclass
+ class HandlerResult:
+     """Result from a structure handler.
+
+     Contains the mask plus metadata about what was detected.
+     """
+
+     mask: StructureMask
+     handler_name: str
+     confidence: float = 1.0  # How confident the handler is in its detection
+     metadata: dict = field(default_factory=dict)
+
+     @property
+     def preservation_ratio(self) -> float:
+         """Fraction of content marked for preservation."""
+         return self.mask.preservation_ratio
+
+
+ @runtime_checkable
+ class StructureHandler(Protocol):
+     """Protocol for structure handlers.
+
+     Any class implementing get_mask() can be used as a handler.
+     """
+
+     @property
+     def name(self) -> str:
+         """Handler name for logging and metadata."""
+         ...
+
+     def get_mask(
+         self,
+         content: str,
+         tokens: list[str] | None = None,
+         **kwargs: Any,
+     ) -> HandlerResult:
+         """Extract structure mask from content.
+
+         Args:
+             content: The content to analyze.
+             tokens: Pre-tokenized content (optional). If not provided,
+                 handler should tokenize internally.
+             **kwargs: Handler-specific options.
+
+         Returns:
+             HandlerResult with mask and metadata.
+         """
+         ...
+
+     def can_handle(self, content: str) -> bool:
+         """Check if this handler can process the content.
+
+         Default implementation returns True. Override for handlers
+         that need to verify content format before processing.
+
+         Args:
+             content: The content to check.
+
+         Returns:
+             True if handler can process this content.
+         """
+         ...
+
+
+ class BaseStructureHandler(ABC):
+     """Base implementation for structure handlers.
+
+     Provides common functionality and enforces the handler interface.
+     Subclasses must implement _extract_mask().
+     """
+
+     def __init__(self, name: str | None = None):
+         """Initialize the handler.
+
+         Args:
+             name: Optional handler name. Defaults to class name.
+         """
+         self._name = name or self.__class__.__name__
+
+     @property
+     def name(self) -> str:
+         """Handler name."""
+         return self._name
+
+     def get_mask(
+         self,
+         content: str,
+         tokens: list[str] | None = None,
+         **kwargs: Any,
+     ) -> HandlerResult:
+         """Extract structure mask from content.
+
+         This is the main entry point. It handles common logic like
+         empty content and delegates to _extract_mask() for the
+         content-specific logic.
+
+         Args:
+             content: The content to analyze.
+             tokens: Pre-tokenized content (optional).
+             **kwargs: Handler-specific options.
+
+         Returns:
+             HandlerResult with mask and metadata.
+         """
+         # Handle empty content
+         if not content or not content.strip():
+             tokens = tokens or []
+             return HandlerResult(
+                 mask=StructureMask.empty(tokens),
+                 handler_name=self.name,
+                 confidence=0.0,
+                 metadata={"empty": True},
+             )
+
+         # Tokenize if not provided
+         if tokens is None:
+             tokens = self._tokenize(content)
+
+         # Delegate to subclass
+         return self._extract_mask(content, tokens, **kwargs)
+
+     def can_handle(self, content: str) -> bool:
+         """Check if this handler can process the content.
+
+         Default implementation returns True. Override for handlers
+         that need to verify content format.
+
+         Args:
+             content: The content to check.
+
+         Returns:
+             True if handler can process this content.
+         """
+         return True
+
+     @abstractmethod
+     def _extract_mask(
+         self,
+         content: str,
+         tokens: list[str],
+         **kwargs: Any,
+     ) -> HandlerResult:
+         """Extract structure mask from content.
+
+         Subclasses implement this to provide content-specific logic.
+
+         Args:
+             content: The content to analyze (non-empty, stripped).
+             tokens: Tokenized content.
+             **kwargs: Handler-specific options.
+
+         Returns:
+             HandlerResult with mask and metadata.
+         """
+         ...
+
+     def _tokenize(self, content: str) -> list[str]:
+         """Default tokenization - character-level.
+
+         Subclasses may override for more sophisticated tokenization.
+         For mask purposes, character-level is often sufficient and
+         aligns well with LLMLingua's token-level compression.
+
+         Args:
+             content: Content to tokenize.
+
+         Returns:
+             List of tokens (characters by default).
+         """
+         # Simple character-level tokenization
+         # This aligns well with structure detection (we mark ranges)
+         return list(content)
+
+
+ class NoOpHandler(BaseStructureHandler):
+     """Handler that marks everything as compressible.
+
+     Used as a fallback when no structure is detected.
+     """
+
+     def __init__(self) -> None:
+         """Initialize the no-op handler."""
+         super().__init__(name="noop")
+
+     def _extract_mask(
+         self,
+         content: str,
+         tokens: list[str],
+         **kwargs: Any,
+     ) -> HandlerResult:
+         """Return mask with everything compressible."""
+         return HandlerResult(
+             mask=StructureMask.empty(tokens),
+             handler_name=self.name,
+             confidence=1.0,
+             metadata={"reason": "no structure detected"},
+         )
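
A sketch of the subclass contract BaseStructureHandler enforces: get_mask() screens empty input and tokenizes, then hands off to _extract_mask(). How a non-empty mask is built depends on masks.py (not shown in this diff), so this hypothetical toy handler returns an all-compressible mask like NoOpHandler does and reports its findings through metadata:

    from typing import Any

    from headroom.compression.handlers.base import BaseStructureHandler, HandlerResult
    from headroom.compression.masks import StructureMask


    class HeadingAwareHandler(BaseStructureHandler):
        """Toy handler: counts Markdown-style headings, preserves nothing."""

        def __init__(self) -> None:
            super().__init__(name="heading_aware")

        def _extract_mask(
            self,
            content: str,
            tokens: list[str],
            **kwargs: Any,
        ) -> HandlerResult:
            # Cheap structural signal: lines that look like headings.
            headings = [ln for ln in content.splitlines() if ln.startswith("#")]
            return HandlerResult(
                mask=StructureMask.empty(tokens),
                handler_name=self.name,
                confidence=0.9 if headings else 0.3,
                metadata={"headings": len(headings)},
            )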