headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""Content type detection for multi-format compression.
|
|
2
|
+
|
|
3
|
+
This module detects the type of tool output content to route it to the
|
|
4
|
+
appropriate compressor. SmartCrusher handles JSON arrays, but coding tasks
|
|
5
|
+
produce many other formats that need specialized handling.
|
|
6
|
+
|
|
7
|
+
Supported content types:
|
|
8
|
+
- JSON_ARRAY: Structured JSON data (existing SmartCrusher)
|
|
9
|
+
- SOURCE_CODE: Python, JavaScript, TypeScript, Go, etc.
|
|
10
|
+
- SEARCH_RESULTS: grep/ripgrep output (file:line:content)
|
|
11
|
+
- BUILD_OUTPUT: Compiler, test, lint logs
|
|
12
|
+
- GIT_DIFF: Unified diff format
|
|
13
|
+
- PLAIN_TEXT: Generic text (fallback)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from enum import Enum
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ContentType(Enum):
    """Categories of tool output recognized by the detector.

    Each member's value is the short string identifier used wherever the
    type is serialized or routed.
    """

    # Structured JSON data; the existing SmartCrusher handles this natively.
    JSON_ARRAY = "json_array"
    # Source code in any supported language (Python, JS, TS, Go, Rust, ...).
    SOURCE_CODE = "source_code"
    # grep/ripgrep-style search output (file:line:content).
    SEARCH_RESULTS = "search"
    # Compiler, test-runner, and lint logs.
    BUILD_OUTPUT = "build"
    # Unified diff / git diff format.
    GIT_DIFF = "diff"
    # Generic text; the fallback when nothing else matches.
    PLAIN_TEXT = "text"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class DetectionResult:
    """Outcome of a content-type detection pass.

    Attributes:
        content_type: The detected category of the content.
        confidence: Detector confidence in the range 0.0 to 1.0.
        metadata: Type-specific details (e.g. ``language`` for code,
            ``item_count`` for JSON arrays).
    """

    content_type: ContentType
    confidence: float
    metadata: dict
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Patterns for detection
# Matches grep -n style prefixes: a path token (no spaces or colons),
# a colon, a line number, and another colon, e.g. "src/main.py:42:".
_SEARCH_RESULT_PATTERN = re.compile(
    r"^[^\s:]+:\d+:"  # file:line: format (grep -n style)
)

# Lines that open a unified/git diff: "diff --git", "--- a/<path>",
# or a hunk header like "@@ -1,4 +1,6 @@".
_DIFF_HEADER_PATTERN = re.compile(r"^(diff --git|--- a/|@@\s+-\d+,\d+\s+\+\d+,\d+\s+@@)")

# Added/removed lines inside a hunk; the "[^+-]" second character
# excludes "+++"/"---" file-header lines from being counted as changes.
_DIFF_CHANGE_PATTERN = re.compile(r"^[+-][^+-]")

# Code patterns by language
# Each language maps to line-anchored regexes for its most distinctive
# constructs; _try_detect_code scores at most one hit per language per line.
_CODE_PATTERNS = {
    "python": [
        re.compile(r"^\s*(def|class|import|from|async def)\s+\w+"),
        re.compile(r"^\s*@\w+"),  # decorators
        re.compile(r'^\s*"""'),  # docstrings
        re.compile(r"^\s*if __name__\s*=="),
    ],
    "javascript": [
        re.compile(r"^\s*(function|const|let|var|class|import|export)\s+"),
        re.compile(r"^\s*(async\s+function|=>\s*\{)"),
        re.compile(r"^\s*module\.exports"),
    ],
    "typescript": [
        re.compile(r"^\s*(interface|type|enum|namespace)\s+\w+"),
        re.compile(r":\s*(string|number|boolean|any|void)\b"),
    ],
    "go": [
        re.compile(r"^\s*(func|type|package|import)\s+"),
        re.compile(r"^\s*func\s+\([^)]+\)\s+\w+"),  # method
    ],
    "rust": [
        re.compile(r"^\s*(fn|struct|enum|impl|mod|use|pub)\s+"),
        re.compile(r"^\s*#\["),  # attributes
    ],
    "java": [
        re.compile(r"^\s*(public|private|protected)\s+(class|interface|enum)"),
        re.compile(r"^\s*@\w+"),  # annotations
        re.compile(r"^\s*package\s+[\w.]+;"),
    ],
}

# Log/build output patterns
# NOTE: order matters — _try_detect_log treats indices 0 and 1 (the
# ERROR and WARN families) as high-severity matches that boost confidence.
_LOG_PATTERNS = [
    re.compile(r"\b(ERROR|FAIL|FAILED|FATAL|CRITICAL)\b", re.IGNORECASE),
    re.compile(r"\b(WARN|WARNING)\b", re.IGNORECASE),
    re.compile(r"\b(INFO|DEBUG|TRACE)\b", re.IGNORECASE),
    re.compile(r"^\s*\d{4}-\d{2}-\d{2}"),  # timestamp
    re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]"),  # time format
    re.compile(r"^={3,}|^-{3,}"),  # separators
    re.compile(r"^\s*PASSED|^\s*FAILED|^\s*SKIPPED"),  # test results
    re.compile(r"^npm ERR!|^yarn error|^cargo error"),  # build tools
    re.compile(r"Traceback \(most recent call last\)"),  # Python traceback
    re.compile(r"^\s*at\s+[\w.$]+\("),  # JS/Java stack trace
]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def detect_content_type(content: str) -> DetectionResult:
    """Classify *content* so it can be routed to the right compressor.

    Detection runs in priority order: JSON array (SmartCrusher's native
    format, accepted unconditionally on a successful parse), then git
    diff, search results, build/log output, and source code — each gated
    by its own minimum confidence. Anything that clears no gate falls
    through to PLAIN_TEXT.

    Args:
        content: The content to analyze.

    Returns:
        DetectionResult with type, confidence, and metadata.

    Examples:
        >>> result = detect_content_type('[{"id": 1}, {"id": 2}]')
        >>> result.content_type
        ContentType.JSON_ARRAY

        >>> result = detect_content_type('src/main.py:42:def process():')
        >>> result.content_type
        ContentType.SEARCH_RESULTS
    """
    if not content or not content.strip():
        return DetectionResult(ContentType.PLAIN_TEXT, 0.0, {})

    # JSON first, for SmartCrusher compatibility: any parse success wins.
    json_hit = _try_detect_json(content)
    if json_hit is not None:
        return json_hit

    # Remaining detectors, ordered by pattern distinctiveness, each with
    # its own confidence floor.
    gated_detectors = (
        (_try_detect_diff, 0.7),    # very distinctive headers
        (_try_detect_search, 0.6),  # file:line: format
        (_try_detect_log, 0.5),     # build/log output
        (_try_detect_code, 0.5),    # source code
    )
    for detector, floor in gated_detectors:
        hit = detector(content)
        if hit is not None and hit.confidence >= floor:
            return hit

    # Nothing matched confidently enough: generic text.
    return DetectionResult(ContentType.PLAIN_TEXT, 0.5, {})
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _try_detect_json(content: str) -> DetectionResult | None:
|
|
151
|
+
"""Try to detect JSON array content."""
|
|
152
|
+
content = content.strip()
|
|
153
|
+
|
|
154
|
+
# Quick check: must start with [ for array
|
|
155
|
+
if not content.startswith("["):
|
|
156
|
+
return None
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
parsed = json.loads(content)
|
|
160
|
+
if isinstance(parsed, list):
|
|
161
|
+
# Check if it's a list of dicts (SmartCrusher compatible)
|
|
162
|
+
if parsed and all(isinstance(item, dict) for item in parsed):
|
|
163
|
+
return DetectionResult(
|
|
164
|
+
ContentType.JSON_ARRAY,
|
|
165
|
+
1.0,
|
|
166
|
+
{"item_count": len(parsed), "is_dict_array": True},
|
|
167
|
+
)
|
|
168
|
+
# It's a list but not of dicts
|
|
169
|
+
return DetectionResult(
|
|
170
|
+
ContentType.JSON_ARRAY,
|
|
171
|
+
0.8,
|
|
172
|
+
{"item_count": len(parsed), "is_dict_array": False},
|
|
173
|
+
)
|
|
174
|
+
except json.JSONDecodeError:
|
|
175
|
+
pass
|
|
176
|
+
|
|
177
|
+
return None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _try_detect_diff(content: str) -> DetectionResult | None:
    """Detect unified/git diff output; return None when no headers appear."""
    # Diffs announce themselves early — only the first 50 lines matter.
    sample = content.split("\n")[:50]

    headers = sum(1 for ln in sample if _DIFF_HEADER_PATTERN.match(ln))
    changes = sum(1 for ln in sample if _DIFF_CHANGE_PATTERN.match(ln))

    # Without at least one header line this cannot be a diff.
    if headers == 0:
        return None

    # Headers dominate the score; +/- change lines add a small boost.
    score = min(1.0, 0.5 + (headers * 0.2) + (changes * 0.05))

    return DetectionResult(
        ContentType.GIT_DIFF,
        score,
        {"header_matches": headers, "change_lines": changes},
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _try_detect_search(content: str) -> DetectionResult | None:
    """Detect grep/ripgrep-style results (file:line:content)."""
    sample = content.split("\n")[:100]  # first 100 lines are enough
    if not sample:
        return None

    hits = sum(
        1 for ln in sample if ln.strip() and _SEARCH_RESULT_PATTERN.match(ln)
    )
    if hits == 0:
        return None

    # Confidence scales with the share of non-empty lines that match.
    populated = sum(1 for ln in sample if ln.strip())
    if populated == 0:
        return None

    fraction = hits / populated

    # Require at least 30% of non-empty lines in file:line: form.
    if fraction < 0.3:
        return None

    return DetectionResult(
        ContentType.SEARCH_RESULTS,
        min(1.0, 0.4 + (fraction * 0.6)),
        {"matching_lines": hits, "total_lines": populated},
    )
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _try_detect_log(content: str) -> DetectionResult | None:
    """Detect build/test/lint log output."""
    sample = content.split("\n")[:200]  # logs are sampled, not fully scanned
    if not sample:
        return None

    total_hits = 0
    severity_hits = 0
    for ln in sample:
        for idx, rx in enumerate(_LOG_PATTERNS):
            if rx.search(ln):
                total_hits += 1
                # Patterns 0 and 1 are the ERROR/WARN families.
                if idx < 2:
                    severity_hits += 1
                break  # count each line at most once

    if total_hits == 0:
        return None

    populated = sum(1 for ln in sample if ln.strip())
    if populated == 0:
        return None

    fraction = total_hits / populated

    # At least 10% of non-empty lines must look log-like.
    if fraction < 0.1:
        return None

    # Severity hits nudge confidence upward on top of the match density.
    score = min(1.0, 0.3 + (fraction * 0.5) + (severity_hits * 0.05))

    return DetectionResult(
        ContentType.BUILD_OUTPUT,
        score,
        {
            "pattern_matches": total_hits,
            "error_matches": severity_hits,
            "total_lines": populated,
        },
    )
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _try_detect_code(content: str) -> DetectionResult | None:
    """Detect source code and guess which language it is."""
    sample = content.split("\n")[:100]  # first 100 lines are enough
    if not sample:
        return None

    # Score one hit per language per line, across all known languages.
    scores: dict[str, int] = {}
    for ln in sample:
        for lang, lang_patterns in _CODE_PATTERNS.items():
            if any(rx.match(ln) for rx in lang_patterns):
                scores[lang] = scores.get(lang, 0) + 1

    if not scores:
        return None

    # The language with the most pattern hits wins.
    top_lang, top_score = max(scores.items(), key=lambda kv: kv[1])

    # Fewer than three hits is too weak a signal to call it code.
    if top_score < 3:
        return None

    populated = sum(1 for ln in sample if ln.strip())
    density = top_score / max(populated, 1)

    confidence = min(1.0, 0.4 + (density * 0.4) + (top_score * 0.02))

    return DetectionResult(
        ContentType.SOURCE_CODE,
        confidence,
        {"language": top_lang, "pattern_matches": top_score},
    )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def is_json_array_of_dicts(content: str) -> bool:
    """Quick check if content is a JSON array of dictionaries.

    This is the format SmartCrusher can handle natively.

    Args:
        content: The content to check.

    Returns:
        True if content is a JSON array where all items are dicts.
    """
    detection = detect_content_type(content)
    if detection.content_type != ContentType.JSON_ARRAY:
        return False
    return detection.metadata.get("is_dict_array", False)
|