headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/transforms/content_detector.py (new file)
@@ -0,0 +1,335 @@
"""Content type detection for multi-format compression.

This module detects the type of tool output content to route it to the
appropriate compressor. SmartCrusher handles JSON arrays, but coding tasks
produce many other formats that need specialized handling.

Supported content types:
- JSON_ARRAY: Structured JSON data (existing SmartCrusher)
- SOURCE_CODE: Python, JavaScript, TypeScript, Go, etc.
- SEARCH_RESULTS: grep/ripgrep output (file:line:content)
- BUILD_OUTPUT: Compiler, test, lint logs
- GIT_DIFF: Unified diff format
- PLAIN_TEXT: Generic text (fallback)
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from enum import Enum


class ContentType(Enum):
    """Types of content that can be compressed."""

    JSON_ARRAY = "json_array"  # Existing SmartCrusher handles this
    SOURCE_CODE = "source_code"  # Python, JS, TS, Go, Rust, etc.
    SEARCH_RESULTS = "search"  # grep/ripgrep output
    BUILD_OUTPUT = "build"  # Compiler, test, lint logs
    GIT_DIFF = "diff"  # Unified diff format
    PLAIN_TEXT = "text"  # Fallback


@dataclass
class DetectionResult:
    """Result of content type detection."""

    content_type: ContentType
    confidence: float  # 0.0 to 1.0
    metadata: dict  # Type-specific metadata (e.g., language for code)
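
For orientation, a minimal sketch (not part of the packaged file) of what a caller gets back, assuming the wheel is installed and the module lives at the path shown in the file list above:

from headroom.transforms.content_detector import ContentType, detect_content_type

result = detect_content_type('[{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]')
assert result.content_type is ContentType.JSON_ARRAY
assert result.confidence == 1.0
assert result.metadata == {"item_count": 2, "is_dict_array": True}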

# Patterns for detection
_SEARCH_RESULT_PATTERN = re.compile(
    r"^[^\s:]+:\d+:"  # file:line: format (grep -n style)
)

_DIFF_HEADER_PATTERN = re.compile(r"^(diff --git|--- a/|@@\s+-\d+,\d+\s+\+\d+,\d+\s+@@)")

_DIFF_CHANGE_PATTERN = re.compile(r"^[+-][^+-]")

# Code patterns by language
_CODE_PATTERNS = {
    "python": [
        re.compile(r"^\s*(def|class|import|from|async def)\s+\w+"),
        re.compile(r"^\s*@\w+"),  # decorators
        re.compile(r'^\s*"""'),  # docstrings
        re.compile(r"^\s*if __name__\s*=="),
    ],
    "javascript": [
        re.compile(r"^\s*(function|const|let|var|class|import|export)\s+"),
        re.compile(r"^\s*(async\s+function|=>\s*\{)"),
        re.compile(r"^\s*module\.exports"),
    ],
    "typescript": [
        re.compile(r"^\s*(interface|type|enum|namespace)\s+\w+"),
        re.compile(r":\s*(string|number|boolean|any|void)\b"),
    ],
    "go": [
        re.compile(r"^\s*(func|type|package|import)\s+"),
        re.compile(r"^\s*func\s+\([^)]+\)\s+\w+"),  # method
    ],
    "rust": [
        re.compile(r"^\s*(fn|struct|enum|impl|mod|use|pub)\s+"),
        re.compile(r"^\s*#\["),  # attributes
    ],
    "java": [
        re.compile(r"^\s*(public|private|protected)\s+(class|interface|enum)"),
        re.compile(r"^\s*@\w+"),  # annotations
        re.compile(r"^\s*package\s+[\w.]+;"),
    ],
}

# Log/build output patterns
_LOG_PATTERNS = [
    re.compile(r"\b(ERROR|FAIL|FAILED|FATAL|CRITICAL)\b", re.IGNORECASE),
    re.compile(r"\b(WARN|WARNING)\b", re.IGNORECASE),
    re.compile(r"\b(INFO|DEBUG|TRACE)\b", re.IGNORECASE),
    re.compile(r"^\s*\d{4}-\d{2}-\d{2}"),  # timestamp
    re.compile(r"^\s*\[\d{2}:\d{2}:\d{2}\]"),  # time format
    re.compile(r"^={3,}|^-{3,}"),  # separators
    re.compile(r"^\s*PASSED|^\s*FAILED|^\s*SKIPPED"),  # test results
    re.compile(r"^npm ERR!|^yarn error|^cargo error"),  # build tools
    re.compile(r"Traceback \(most recent call last\)"),  # Python traceback
    re.compile(r"^\s*at\s+[\w.$]+\("),  # JS/Java stack trace
]


def detect_content_type(content: str) -> DetectionResult:
    """Detect the type of content for appropriate compression.

    Args:
        content: The content to analyze.

    Returns:
        DetectionResult with type, confidence, and metadata.

    Examples:
        >>> result = detect_content_type('[{"id": 1}, {"id": 2}]')
        >>> result.content_type
        <ContentType.JSON_ARRAY: 'json_array'>

        >>> result = detect_content_type('src/main.py:42:def process():')
        >>> result.content_type
        <ContentType.SEARCH_RESULTS: 'search'>
    """
    if not content or not content.strip():
        return DetectionResult(ContentType.PLAIN_TEXT, 0.0, {})

    # 1. Try JSON first (highest priority for SmartCrusher compatibility)
    json_result = _try_detect_json(content)
    if json_result:
        return json_result

    # 2. Check for diff (very distinctive patterns)
    diff_result = _try_detect_diff(content)
    if diff_result and diff_result.confidence >= 0.7:
        return diff_result

    # 3. Check for search results (file:line: format)
    search_result = _try_detect_search(content)
    if search_result and search_result.confidence >= 0.6:
        return search_result

    # 4. Check for build/log output
    log_result = _try_detect_log(content)
    if log_result and log_result.confidence >= 0.5:
        return log_result

    # 5. Check for source code
    code_result = _try_detect_code(content)
    if code_result and code_result.confidence >= 0.5:
        return code_result

    # 6. Fallback to plain text
    return DetectionResult(ContentType.PLAIN_TEXT, 0.5, {})
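
The ordering above is deliberate: each helper that fires below its threshold falls through to the next check, and anything unmatched lands on PLAIN_TEXT at 0.5 confidence. A quick illustrative sketch (not part of the file):

from headroom.transforms.content_detector import ContentType, detect_content_type

# grep-style file:line: output is claimed at step 3
hits = "src/main.py:42:def process():\nsrc/cli.py:7:import json"
assert detect_content_type(hits).content_type is ContentType.SEARCH_RESULTS

# ordinary prose matches no detector and falls through to the default
prose = "The quick brown fox jumps over the lazy dog."
result = detect_content_type(prose)
assert result.content_type is ContentType.PLAIN_TEXT
assert result.confidence == 0.5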


def _try_detect_json(content: str) -> DetectionResult | None:
    """Try to detect JSON array content."""
    content = content.strip()

    # Quick check: must start with [ for array
    if not content.startswith("["):
        return None

    try:
        parsed = json.loads(content)
        if isinstance(parsed, list):
            # Check if it's a list of dicts (SmartCrusher compatible)
            if parsed and all(isinstance(item, dict) for item in parsed):
                return DetectionResult(
                    ContentType.JSON_ARRAY,
                    1.0,
                    {"item_count": len(parsed), "is_dict_array": True},
                )
            # It's a list but not of dicts
            return DetectionResult(
                ContentType.JSON_ARRAY,
                0.8,
                {"item_count": len(parsed), "is_dict_array": False},
            )
    except json.JSONDecodeError:
        pass

    return None
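
Note the two confidence tiers: only a non-empty array of objects earns 1.0; other arrays are still reported as JSON_ARRAY but at 0.8 with is_dict_array False, which is what is_json_array_of_dicts (end of file) keys on. An illustration, calling the private helper directly:

from headroom.transforms.content_detector import _try_detect_json

scalars = _try_detect_json("[1, 2, 3]")
assert scalars is not None and scalars.confidence == 0.8
assert scalars.metadata == {"item_count": 3, "is_dict_array": False}

# Top-level JSON objects (anything not starting with "[") are not claimed here
assert _try_detect_json('{"id": 1}') is None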


def _try_detect_diff(content: str) -> DetectionResult | None:
    """Try to detect git diff format."""
    lines = content.split("\n")[:50]  # Check first 50 lines

    header_matches = 0
    change_matches = 0

    for line in lines:
        if _DIFF_HEADER_PATTERN.match(line):
            header_matches += 1
        if _DIFF_CHANGE_PATTERN.match(line):
            change_matches += 1

    if header_matches == 0:
        return None

    # High confidence if we see diff headers
    confidence = min(1.0, 0.5 + (header_matches * 0.2) + (change_matches * 0.05))

    return DetectionResult(
        ContentType.GIT_DIFF,
        confidence,
        {"header_matches": header_matches, "change_lines": change_matches},
    )
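
Confidence is 0.5 plus 0.2 per header line and 0.05 per change line, capped at 1.0, so even a small one-hunk patch clears the 0.7 threshold applied in detect_content_type. A sketch (illustrative only):

from headroom.transforms.content_detector import ContentType, detect_content_type

patch = """diff --git a/app.py b/app.py
--- a/app.py
+++ b/app.py
@@ -1,2 +1,3 @@
+import sys
 def main():
-    pass
+    return 0
"""
result = detect_content_type(patch)
assert result.content_type is ContentType.GIT_DIFF
# headers: "diff --git", "--- a/", and the @@ hunk line; "+++ b/" is not counted
assert result.metadata["header_matches"] == 3
assert result.metadata["change_lines"] == 3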


def _try_detect_search(content: str) -> DetectionResult | None:
    """Try to detect grep/ripgrep search results."""
    lines = content.split("\n")[:100]  # Check first 100 lines
    if not lines:
        return None

    matching_lines = 0
    for line in lines:
        if line.strip() and _SEARCH_RESULT_PATTERN.match(line):
            matching_lines += 1

    if matching_lines == 0:
        return None

    # Calculate confidence based on proportion of matching lines
    non_empty_lines = sum(1 for line in lines if line.strip())
    if non_empty_lines == 0:
        return None

    ratio = matching_lines / non_empty_lines

    # Need at least 30% of lines to match the pattern
    if ratio < 0.3:
        return None

    confidence = min(1.0, 0.4 + (ratio * 0.6))

    return DetectionResult(
        ContentType.SEARCH_RESULTS,
        confidence,
        {"matching_lines": matching_lines, "total_lines": non_empty_lines},
    )
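
The 30% ratio gate keeps mixed output from being misread as search results. A sketch (illustrative only; the helper is private):

from headroom.transforms.content_detector import _try_detect_search

hits = "src/main.py:42:def process():\nsrc/utils.py:7:import json\nREADME.md:3:# Headroom"
result = _try_detect_search(hits)
assert result is not None
assert result.metadata == {"matching_lines": 3, "total_lines": 3}

# One hit among four non-empty lines is a 25% ratio, below the 30% bar
mixed = "src/main.py:42:def process():\nsome prose\nmore prose\nstill prose"
assert _try_detect_search(mixed) is None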


def _try_detect_log(content: str) -> DetectionResult | None:
    """Try to detect build/log output."""
    lines = content.split("\n")[:200]  # Check first 200 lines
    if not lines:
        return None

    pattern_matches = 0
    error_matches = 0

    for line in lines:
        for i, pattern in enumerate(_LOG_PATTERNS):
            if pattern.search(line):
                pattern_matches += 1
                if i < 2:  # ERROR or WARN patterns
                    error_matches += 1
                break  # One pattern per line is enough

    if pattern_matches == 0:
        return None

    non_empty_lines = sum(1 for line in lines if line.strip())
    if non_empty_lines == 0:
        return None

    ratio = pattern_matches / non_empty_lines

    # Need at least 10% of lines to match log patterns
    if ratio < 0.1:
        return None

    confidence = min(1.0, 0.3 + (ratio * 0.5) + (error_matches * 0.05))

    return DetectionResult(
        ContentType.BUILD_OUTPUT,
        confidence,
        {
            "pattern_matches": pattern_matches,
            "error_matches": error_matches,
            "total_lines": non_empty_lines,
        },
    )
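
Severity matters here: hits on the first two patterns (errors and warnings) add 0.05 each on top of the ratio term. A sketch with a typical failing build (illustrative only):

from headroom.transforms.content_detector import ContentType, detect_content_type

log = "\n".join(
    [
        "2024-01-15 09:31:02 INFO compiling 14 modules",
        "2024-01-15 09:31:04 WARNING 'init' is deprecated",
        "2024-01-15 09:31:09 ERROR build failed",
    ]
)
result = detect_content_type(log)
assert result.content_type is ContentType.BUILD_OUTPUT
assert result.metadata["error_matches"] == 2  # the WARNING and ERROR lines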


def _try_detect_code(content: str) -> DetectionResult | None:
    """Try to detect source code and identify language."""
    lines = content.split("\n")[:100]  # Check first 100 lines
    if not lines:
        return None

    language_scores: dict[str, int] = {}

    for line in lines:
        for lang, patterns in _CODE_PATTERNS.items():
            for pattern in patterns:
                if pattern.match(line):
                    language_scores[lang] = language_scores.get(lang, 0) + 1
                    break  # One pattern per language per line

    if not language_scores:
        return None

    # Find best matching language
    best_lang = max(language_scores, key=lambda k: language_scores[k])
    best_score = language_scores[best_lang]

    # Need at least 3 pattern matches to be confident
    if best_score < 3:
        return None

    non_empty_lines = sum(1 for line in lines if line.strip())
    ratio = best_score / max(non_empty_lines, 1)

    confidence = min(1.0, 0.4 + (ratio * 0.4) + (best_score * 0.02))

    return DetectionResult(
        ContentType.SOURCE_CODE,
        confidence,
        {"language": best_lang, "pattern_matches": best_score},
    )
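
Scoring is per language with at most one credit per line per language, and the winner needs at least three hits. Keywords like class and import count for several languages, so here is a sketch where Python outscores the overlap (illustrative only):

from headroom.transforms.content_detector import ContentType, detect_content_type

snippet = "\n".join(
    [
        "import json",
        "",
        "@dataclass",
        "class Point:",
        '    """A 2-D point."""',
        "",
        "def norm(p):",
        "    return (p.x**2 + p.y**2) ** 0.5",
    ]
)
result = detect_content_type(snippet)
assert result.content_type is ContentType.SOURCE_CODE
# "import json" and "class Point:" also score for JavaScript and Go, but
# Python collects five hits here and wins
assert result.metadata["language"] == "python"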


def is_json_array_of_dicts(content: str) -> bool:
    """Quick check if content is a JSON array of dictionaries.

    This is the format SmartCrusher can handle natively.

    Args:
        content: The content to check.

    Returns:
        True if content is a JSON array where all items are dicts.
    """
    result = detect_content_type(content)
    return result.content_type == ContentType.JSON_ARRAY and result.metadata.get(
        "is_dict_array", False
    )
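
Downstream code can use this as a cheap gate before handing content to SmartCrusher. A sketch of the intended branch (SmartCrusher's actual API is not shown in this file, so the routing calls are placeholders):

from headroom.transforms.content_detector import is_json_array_of_dicts

payload = '[{"path": "a.py", "hits": 3}, {"path": "b.py", "hits": 1}]'
assert is_json_array_of_dicts(payload)
assert not is_json_array_of_dicts("[1, 2, 3]")  # an array, but not of dicts

if is_json_array_of_dicts(payload):
    ...  # route to SmartCrusher's native JSON-array path
else:
    ...  # fall back to the type-specific handlers above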