headroom-ai 0.2.13 (headroom_ai-0.2.13-py3-none-any.whl)
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
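
For orientation, the paths in this listing map directly to import paths once the headroom-ai distribution is installed. A minimal sketch, assuming a standard pip install of this wheel; the package's actual public re-exports in headroom/__init__.py are not shown in this section, so only the direct module path is used:

# Hypothetical direct imports inferred from the file paths listed above;
# the classes named here are defined in the hunk that follows.
from headroom.transforms.search_compressor import (
    SearchCompressor,
    SearchCompressorConfig,
)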
headroom/transforms/search_compressor.py
@@ -0,0 +1,365 @@
+"""Search results compressor for grep/ripgrep output.
+
+This module compresses search results (grep, ripgrep, ag) which are one of
+the most common outputs in coding tasks. Typical compression: 5-10x.
+
+Input Format (grep -n style):
+    src/utils.py:42:def process_data(items):
+    src/utils.py:43: \"\"\"Process items with validation.\"\"\"
+    src/models.py:15:class DataProcessor:
+
+Compression Strategy:
+1. Parse into {file: [(line, content), ...]} structure
+2. Group by file
+3. For each file: keep first match, last match, context-relevant matches
+4. Deduplicate near-identical lines
+5. Add summary: [... and N more matches in file.py]
+
+Integrates with CCR for reversible compression.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+
+
+@dataclass
+class SearchMatch:
+    """A single search match."""
+
+    file: str
+    line_number: int
+    content: str
+    score: float = 0.0  # Relevance score
+
+
+@dataclass
+class FileMatches:
+    """All matches in a single file."""
+
+    file: str
+    matches: list[SearchMatch] = field(default_factory=list)
+
+    @property
+    def first(self) -> SearchMatch | None:
+        return self.matches[0] if self.matches else None
+
+    @property
+    def last(self) -> SearchMatch | None:
+        return self.matches[-1] if self.matches else None
+
+
+@dataclass
+class SearchCompressorConfig:
+    """Configuration for search result compression."""
+
+    # Per-file limits
+    max_matches_per_file: int = 5
+    always_keep_first: bool = True
+    always_keep_last: bool = True
+
+    # Global limits
+    max_total_matches: int = 30
+    max_files: int = 15
+
+    # Context matching
+    context_keywords: list[str] = field(default_factory=list)
+    boost_errors: bool = True
+
+    # CCR integration
+    enable_ccr: bool = True
+    min_matches_for_ccr: int = 10
+
+
+class SearchCompressor:
+    """Compresses grep/ripgrep search results.
+
+    Example:
+        >>> compressor = SearchCompressor()
+        >>> result = compressor.compress(search_output, context="find error handlers")
+        >>> print(result.compressed)  # Reduced output with summary
+    """
+
+    # Pattern to parse grep-style output: file:line:content
+    _GREP_PATTERN = re.compile(r"^([^:]+):(\d+):(.*)$")
+
+    # Pattern for ripgrep with context (file-line-content or file:line:content)
+    _RG_CONTEXT_PATTERN = re.compile(r"^([^:-]+)[:-](\d+)[:-](.*)$")
+
+    # Error/important patterns to prioritize
+    _PRIORITY_PATTERNS = [
+        re.compile(r"\b(error|exception|fail|fatal)\b", re.IGNORECASE),
+        re.compile(r"\b(warn|warning)\b", re.IGNORECASE),
+        re.compile(r"\b(todo|fixme|hack|xxx)\b", re.IGNORECASE),
+    ]
+
+    def __init__(self, config: SearchCompressorConfig | None = None):
+        """Initialize search compressor.
+
+        Args:
+            config: Compression configuration.
+        """
+        self.config = config or SearchCompressorConfig()
+
+    def compress(
+        self,
+        content: str,
+        context: str = "",
+    ) -> SearchCompressionResult:
+        """Compress search results.
+
+        Args:
+            content: Raw grep/ripgrep output.
+            context: User query context for relevance scoring.
+
+        Returns:
+            SearchCompressionResult with compressed output and metadata.
+        """
+        # Parse search results
+        file_matches = self._parse_search_results(content)
+
+        if not file_matches:
+            return SearchCompressionResult(
+                compressed=content,
+                original=content,
+                original_match_count=0,
+                compressed_match_count=0,
+                files_affected=0,
+                compression_ratio=1.0,
+            )
+
+        # Count original matches
+        original_count = sum(len(fm.matches) for fm in file_matches.values())
+
+        # Score matches by relevance
+        self._score_matches(file_matches, context)
+
+        # Select top matches per file
+        selected = self._select_matches(file_matches)
+
+        # Format compressed output
+        compressed, summaries = self._format_output(selected, file_matches)
+
+        # Count compressed matches
+        compressed_count = sum(len(fm.matches) for fm in selected.values())
+
+        # Calculate compression ratio
+        ratio = len(compressed) / max(len(content), 1)
+
+        # Store in CCR if significant compression
+        cache_key = None
+        if (
+            self.config.enable_ccr
+            and original_count >= self.config.min_matches_for_ccr
+            and ratio < 0.8
+        ):
+            cache_key = self._store_in_ccr(content, compressed, original_count)
+            if cache_key:
+                compressed += f"\n[{original_count} matches compressed. hash={cache_key}]"
+
+        return SearchCompressionResult(
+            compressed=compressed,
+            original=content,
+            original_match_count=original_count,
+            compressed_match_count=compressed_count,
+            files_affected=len(file_matches),
+            compression_ratio=ratio,
+            cache_key=cache_key,
+            summaries=summaries,
+        )
+
+    def _parse_search_results(self, content: str) -> dict[str, FileMatches]:
+        """Parse grep-style output into structured data."""
+        file_matches: dict[str, FileMatches] = {}
+
+        for line in content.split("\n"):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Try grep pattern first
+            match = self._GREP_PATTERN.match(line)
+            if not match:
+                match = self._RG_CONTEXT_PATTERN.match(line)
+
+            if match:
+                file_path, line_num, match_content = match.groups()
+
+                if file_path not in file_matches:
+                    file_matches[file_path] = FileMatches(file=file_path)
+
+                file_matches[file_path].matches.append(
+                    SearchMatch(
+                        file=file_path,
+                        line_number=int(line_num),
+                        content=match_content,
+                    )
+                )
+
+        return file_matches
+
+    def _score_matches(
+        self,
+        file_matches: dict[str, FileMatches],
+        context: str,
+    ) -> None:
+        """Score matches by relevance to context."""
+        context_lower = context.lower()
+        context_words = set(context_lower.split())
+
+        for fm in file_matches.values():
+            for match in fm.matches:
+                score = 0.0
+                content_lower = match.content.lower()
+
+                # Score by context word overlap
+                for word in context_words:
+                    if len(word) > 2 and word in content_lower:
+                        score += 0.3
+
+                # Boost error/warning patterns
+                if self.config.boost_errors:
+                    for i, pattern in enumerate(self._PRIORITY_PATTERNS):
+                        if pattern.search(match.content):
+                            score += 0.5 - (i * 0.1)  # Higher boost for errors
+
+                # Boost for keyword matches
+                for keyword in self.config.context_keywords:
+                    if keyword.lower() in content_lower:
+                        score += 0.4
+
+                match.score = min(1.0, score)
+
+    def _select_matches(
+        self,
+        file_matches: dict[str, FileMatches],
+    ) -> dict[str, FileMatches]:
+        """Select top matches per file and globally."""
+        selected: dict[str, FileMatches] = {}
+
+        # Sort files by total match score (highest first)
+        sorted_files = sorted(
+            file_matches.items(),
+            key=lambda x: sum(m.score for m in x[1].matches),
+            reverse=True,
+        )
+
+        # Limit number of files
+        sorted_files = sorted_files[: self.config.max_files]
+
+        total_selected = 0
+        for file_path, fm in sorted_files:
+            if total_selected >= self.config.max_total_matches:
+                break
+
+            # Sort matches by score
+            sorted_matches = sorted(fm.matches, key=lambda m: m.score, reverse=True)
+
+            # Select matches for this file
+            file_selected: list[SearchMatch] = []
+            remaining_slots = min(
+                self.config.max_matches_per_file,
+                self.config.max_total_matches - total_selected,
+            )
+
+            # Always include first and last if configured
+            if self.config.always_keep_first and fm.first:
+                file_selected.append(fm.first)
+                remaining_slots -= 1
+
+            if (
+                self.config.always_keep_last
+                and fm.last
+                and fm.last != fm.first
+                and remaining_slots > 0
+            ):
+                file_selected.append(fm.last)
+                remaining_slots -= 1
+
+            # Fill remaining slots with highest-scoring matches
+            for match in sorted_matches:
+                if remaining_slots <= 0:
+                    break
+                if match not in file_selected:
+                    file_selected.append(match)
+                    remaining_slots -= 1
+
+            # Sort by line number for output
+            file_selected.sort(key=lambda m: m.line_number)
+
+            selected[file_path] = FileMatches(file=file_path, matches=file_selected)
+            total_selected += len(file_selected)
+
+        return selected
+
+    def _format_output(
+        self,
+        selected: dict[str, FileMatches],
+        original: dict[str, FileMatches],
+    ) -> tuple[str, dict[str, str]]:
+        """Format selected matches back to grep-style output."""
+        lines: list[str] = []
+        summaries: dict[str, str] = {}
+
+        for file_path, fm in sorted(selected.items()):
+            for match in fm.matches:
+                lines.append(f"{match.file}:{match.line_number}:{match.content}")
+
+            # Add summary if matches were omitted
+            original_fm = original.get(file_path)
+            if original_fm and len(original_fm.matches) > len(fm.matches):
+                omitted = len(original_fm.matches) - len(fm.matches)
+                summary = f"[... and {omitted} more matches in {file_path}]"
+                lines.append(summary)
+                summaries[file_path] = summary
+
+        return "\n".join(lines), summaries
+
+    def _store_in_ccr(
+        self,
+        original: str,
+        compressed: str,
+        original_count: int,
+    ) -> str | None:
+        """Store original in CCR for later retrieval."""
+        try:
+            from ..cache.compression_store import get_compression_store
+
+            store = get_compression_store()
+            return store.store(
+                original,
+                compressed,
+                original_item_count=original_count,
+            )
+        except ImportError:
+            # CCR not available
+            return None
+        except Exception:
+            # Silently fail CCR storage
+            return None
+
+
+@dataclass
+class SearchCompressionResult:
+    """Result of search result compression."""
+
+    compressed: str
+    original: str
+    original_match_count: int
+    compressed_match_count: int
+    files_affected: int
+    compression_ratio: float
+    cache_key: str | None = None
+    summaries: dict[str, str] = field(default_factory=dict)
+
+    @property
+    def tokens_saved_estimate(self) -> int:
+        """Estimate tokens saved (rough: 1 token per 4 chars)."""
+        chars_saved = len(self.original) - len(self.compressed)
+        return max(0, chars_saved // 4)
+
+    @property
+    def matches_omitted(self) -> int:
+        """Number of matches omitted."""
+        return self.original_match_count - self.compressed_match_count
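
As a quick illustration of the API added by this file, here is a minimal usage sketch. It is not part of the package; the import path is inferred from the file listing above, and the sample grep output is made up. With max_matches_per_file=2, the three matches in src/utils.py collapse to the first and last plus a summary line:

from headroom.transforms.search_compressor import (
    SearchCompressor,
    SearchCompressorConfig,
)

# Four grep -n style matches across two files (hypothetical output).
raw = "\n".join([
    "src/utils.py:42:def process_data(items):",
    "src/utils.py:57:    log.warning('empty batch')",
    "src/utils.py:88:    raise ValueError('process_data failed')",
    "src/models.py:15:class DataProcessor:",
])

compressor = SearchCompressor(SearchCompressorConfig(max_matches_per_file=2))
result = compressor.compress(raw, context="find error handlers")

print(result.compressed)
# src/models.py:15:class DataProcessor:
# src/utils.py:42:def process_data(items):
# src/utils.py:88:    raise ValueError('process_data failed')
# [... and 1 more matches in src/utils.py]

print(result.matches_omitted)         # 1
print(result.tokens_saved_estimate)   # rough estimate: characters saved // 4

Because only four matches are present, the result stays below min_matches_for_ccr (default 10), so no reversible CCR entry is stored and no hash marker is appended to the compressed output.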