headroom-ai 0.2.13 (headroom_ai-0.2.13-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/telemetry/collector.py
@@ -0,0 +1,764 @@
"""TelemetryCollector for privacy-preserving statistics collection.

This module collects anonymized statistics about compression patterns
to enable cross-user learning and improve compression over time.
"""

from __future__ import annotations

import hashlib
import json
import os
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from .models import (
    AnonymizedToolStats,
    CompressionEvent,
    FieldDistribution,
    RetrievalStats,
    ToolSignature,
)


@dataclass
class TelemetryConfig:
    """Configuration for telemetry collection."""

    # Enable/disable telemetry
    enabled: bool = True

    # Storage
    storage_path: str | None = None  # Path to store telemetry data (None = in-memory only)
    auto_save_interval: int = 300  # Auto-save every N seconds (0 = disabled)

    # Privacy settings
    anonymize_tool_names: bool = True  # Hash tool names
    collect_field_names: bool = False  # If False, only collect field hashes
    collect_timing: bool = True  # Collect processing time

    # Aggregation settings
    max_events_in_memory: int = 10000  # Max events to keep in memory
    min_samples_for_recommendation: int = 10  # Min samples before making recommendations

    # Export settings
    include_field_distributions: bool = True  # Include detailed field stats in export
    include_recommendations: bool = True  # Include learned recommendations


class TelemetryCollector:
    """Collects and aggregates compression telemetry.

    Thread-safe collector that maintains anonymized statistics about
    compression patterns. Can be used to:
    - Understand what tool outputs look like (structurally)
    - Track which compression strategies work best
    - Learn optimal settings per tool type
    - Export data for cross-user aggregation

    Privacy guarantees:
    - No actual data values are stored
    - Tool names are hashed by default
    - Field names can be hashed
    - No user identifiers
    - No query content
    """

    def __init__(self, config: TelemetryConfig | None = None):
        """Initialize the telemetry collector.

        Args:
            config: Configuration options. Uses defaults if not provided.
        """
        self._config = config or TelemetryConfig()
        self._lock = threading.Lock()

        # Event storage
        self._events: list[CompressionEvent] = []

        # Aggregated stats per tool signature
        self._tool_stats: dict[str, AnonymizedToolStats] = {}

        # Retrieval tracking
        self._retrieval_stats: dict[str, RetrievalStats] = {}

        # Global counters
        self._total_compressions: int = 0
        self._total_retrievals: int = 0
        self._total_tokens_saved: int = 0

        # Auto-save tracking
        self._last_save_time: float = time.time()
        self._dirty: bool = False

        # Load existing data if storage path exists
        if self._config.storage_path:
            self._load_from_disk()

    def record_compression(
        self,
        items: list[dict[str, Any]],
        original_count: int,
        compressed_count: int,
        original_tokens: int,
        compressed_tokens: int,
        strategy: str,
        *,
        tool_name: str | None = None,
        strategy_reason: str | None = None,
        crushability_score: float | None = None,
        crushability_reason: str | None = None,
        kept_first_n: int = 0,
        kept_last_n: int = 0,
        kept_errors: int = 0,
        kept_anomalies: int = 0,
        kept_by_relevance: int = 0,
        kept_by_score: int = 0,
        processing_time_ms: float = 0.0,
    ) -> None:
        """Record a compression event.

        Args:
            items: Sample items from the original array (for structure analysis).
            original_count: Original number of items.
            compressed_count: Number of items after compression.
            original_tokens: Original token count.
            compressed_tokens: Compressed token count.
            strategy: Compression strategy used.
            tool_name: Optional tool name (will be hashed if configured).
            strategy_reason: Why this strategy was chosen.
            crushability_score: Crushability analysis score.
            crushability_reason: Crushability analysis reason.
            kept_first_n: Items kept from start.
            kept_last_n: Items kept from end.
            kept_errors: Error items kept.
            kept_anomalies: Anomalous items kept.
            kept_by_relevance: Items kept by relevance score.
            kept_by_score: Items kept by score field.
            processing_time_ms: Processing time in milliseconds.
        """
        if not self._config.enabled:
            return

        # Create tool signature from items
        signature = ToolSignature.from_items(items[:10])  # Sample first 10

        # Analyze field distributions
        field_distributions: list[FieldDistribution] = []
        if self._config.include_field_distributions and items:
            field_distributions = self._analyze_fields(items[:100])  # Sample 100

        # Calculate ratios
        compression_ratio = compressed_count / original_count if original_count > 0 else 0.0
        token_reduction = 1 - (compressed_tokens / original_tokens) if original_tokens > 0 else 0.0

        # Create event
        event = CompressionEvent(
            tool_signature=signature,
            original_item_count=original_count,
            compressed_item_count=compressed_count,
            compression_ratio=compression_ratio,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            token_reduction_ratio=token_reduction,
            strategy=strategy,
            strategy_reason=strategy_reason,
            crushability_score=crushability_score,
            crushability_reason=crushability_reason,
            field_distributions=field_distributions,
            kept_first_n=kept_first_n,
            kept_last_n=kept_last_n,
            kept_errors=kept_errors,
            kept_anomalies=kept_anomalies,
            kept_by_relevance=kept_by_relevance,
            kept_by_score=kept_by_score,
            timestamp=time.time(),
            processing_time_ms=processing_time_ms,
        )

        should_save = False
        with self._lock:
            # Store event
            self._events.append(event)
            if len(self._events) > self._config.max_events_in_memory:
                self._events = self._events[-self._config.max_events_in_memory :]

            # Update aggregated stats
            self._update_tool_stats(signature, event)

            # Update global counters
            self._total_compressions += 1
            self._total_tokens_saved += original_tokens - compressed_tokens
            self._dirty = True

            # Check if auto-save needed (don't actually save while holding lock)
            should_save = self._should_auto_save()

        # Auto-save outside lock to avoid blocking other operations
        if should_save:
            self.save()

    def record_retrieval(
        self,
        tool_signature_hash: str,
        retrieval_type: str,  # "full" or "search"
        query_fields: list[str] | None = None,
    ) -> None:
        """Record a retrieval event.

        This is called when an LLM retrieves compressed content, indicating
        the compression may have been too aggressive.

        Args:
            tool_signature_hash: Hash of the tool signature.
            retrieval_type: "full" (retrieved everything) or "search" (filtered).
            query_fields: Field names mentioned in search query (will be hashed).
        """
        if not self._config.enabled:
            return

        with self._lock:
            # Get or create retrieval stats
            if tool_signature_hash not in self._retrieval_stats:
                self._retrieval_stats[tool_signature_hash] = RetrievalStats(
                    tool_signature_hash=tool_signature_hash
                )

            stats = self._retrieval_stats[tool_signature_hash]
            stats.total_retrievals += 1

            if retrieval_type == "full":
                stats.full_retrievals += 1
            else:
                stats.search_retrievals += 1

            # Track queried fields (anonymized)
            if query_fields:
                for field_name in query_fields:
                    field_hash = self._hash_field_name(field_name)
                    stats.query_field_frequency[field_hash] = (
                        stats.query_field_frequency.get(field_hash, 0) + 1
                    )

            # Update global counter
            self._total_retrievals += 1
            self._dirty = True

            # Update tool stats with retrieval info
            if tool_signature_hash in self._tool_stats:
                self._tool_stats[tool_signature_hash].retrieval_stats = stats
                self._update_recommendations(tool_signature_hash)

    def get_stats(self) -> dict[str, Any]:
        """Get overall telemetry statistics.

        Returns:
            Dictionary with aggregated statistics.
        """
        with self._lock:
            return {
                "enabled": self._config.enabled,
                "total_compressions": self._total_compressions,
                "total_retrievals": self._total_retrievals,
                "total_tokens_saved": self._total_tokens_saved,
                "global_retrieval_rate": (
                    self._total_retrievals / self._total_compressions
                    if self._total_compressions > 0
                    else 0.0
                ),
                "tool_signatures_tracked": len(self._tool_stats),
                "events_in_memory": len(self._events),
                "avg_compression_ratio": self._calculate_avg_compression_ratio(),
                "avg_token_reduction": self._calculate_avg_token_reduction(),
            }

    def get_tool_stats(self, signature_hash: str) -> AnonymizedToolStats | None:
        """Get statistics for a specific tool signature.

        Args:
            signature_hash: The tool signature hash.

        Returns:
            AnonymizedToolStats if found, None otherwise.
        """
        with self._lock:
            return self._tool_stats.get(signature_hash)

    def get_all_tool_stats(self) -> dict[str, AnonymizedToolStats]:
        """Get statistics for all tracked tool signatures.

        Returns:
            Dictionary mapping signature hash to stats.
        """
        with self._lock:
            return dict(self._tool_stats)

    def get_recommendations(self, signature_hash: str) -> dict[str, Any] | None:
        """Get learned recommendations for a tool signature.

        Args:
            signature_hash: The tool signature hash.

        Returns:
            Recommendations dictionary if available, None otherwise.
        """
        with self._lock:
            stats = self._tool_stats.get(signature_hash)
            if not stats or stats.sample_size < self._config.min_samples_for_recommendation:
                return None

            return {
                "signature_hash": signature_hash,
                "recommended_min_items": stats.recommended_min_items,
                "recommended_preserve_fields": stats.recommended_preserve_fields,
                "skip_compression_recommended": stats.skip_compression_recommended,
                "confidence": stats.confidence,
                "based_on_samples": stats.sample_size,
                "retrieval_rate": (
                    stats.retrieval_stats.retrieval_rate if stats.retrieval_stats else None
                ),
            }

    def export_stats(self) -> dict[str, Any]:
        """Export all telemetry data for aggregation.

        This is the data that can be sent to a central server for
        cross-user learning (with user consent).

        Returns:
            Complete telemetry export.
        """
        with self._lock:
            export = {
                "version": "1.0",
                "export_timestamp": time.time(),
                "summary": {
                    "total_compressions": self._total_compressions,
                    "total_retrievals": self._total_retrievals,
                    "total_tokens_saved": self._total_tokens_saved,
                    "tool_signatures_tracked": len(self._tool_stats),
                },
                "tool_stats": {
                    sig_hash: stats.to_dict() for sig_hash, stats in self._tool_stats.items()
                },
            }

            if self._config.include_recommendations:
                export["recommendations"] = {
                    sig_hash: {
                        "recommended_min_items": stats.recommended_min_items,
                        "skip_compression_recommended": stats.skip_compression_recommended,
                        "confidence": stats.confidence,
                    }
                    for sig_hash, stats in self._tool_stats.items()
                    if stats.sample_size >= self._config.min_samples_for_recommendation
                }

            return export

    def import_stats(self, data: dict[str, Any]) -> None:
        """Import telemetry data from another source.

        This allows merging stats from multiple users for cross-user learning.

        Args:
            data: Exported telemetry data.
        """
        if not self._config.enabled:
            return

        with self._lock:
            # Import summary counters
            summary = data.get("summary", {})
            self._total_compressions += summary.get("total_compressions", 0)
            self._total_retrievals += summary.get("total_retrievals", 0)
            self._total_tokens_saved += summary.get("total_tokens_saved", 0)

            # Import tool stats
            tool_stats_data = data.get("tool_stats", {})
            for sig_hash, stats_dict in tool_stats_data.items():
                if sig_hash in self._tool_stats:
                    # Merge with existing
                    existing = self._tool_stats[sig_hash]
                    imported = AnonymizedToolStats.from_dict(stats_dict)
                    self._merge_tool_stats(existing, imported)
                else:
                    # Add new
                    self._tool_stats[sig_hash] = AnonymizedToolStats.from_dict(stats_dict)

            self._dirty = True

    def clear(self) -> None:
        """Clear all telemetry data. Mainly for testing."""
        with self._lock:
            self._events.clear()
            self._tool_stats.clear()
            self._retrieval_stats.clear()
            self._total_compressions = 0
            self._total_retrievals = 0
            self._total_tokens_saved = 0
            self._dirty = False

    def save(self) -> None:
        """Save telemetry data to disk."""
        if not self._config.storage_path:
            return

        with self._lock:
            # Build export data inline to avoid deadlock (export_stats also acquires lock)
            data = {
                "version": "1.0",
                "export_timestamp": time.time(),
                "summary": {
                    "total_compressions": self._total_compressions,
                    "total_retrievals": self._total_retrievals,
                    "total_tokens_saved": self._total_tokens_saved,
                    "tool_signatures_tracked": len(self._tool_stats),
                },
                "tool_stats": {
                    sig_hash: stats.to_dict() for sig_hash, stats in self._tool_stats.items()
                },
            }

            if self._config.include_recommendations:
                data["recommendations"] = {
                    sig_hash: {
                        "recommended_min_items": stats.recommended_min_items,
                        "skip_compression_recommended": stats.skip_compression_recommended,
                        "confidence": stats.confidence,
                    }
                    for sig_hash, stats in self._tool_stats.items()
                    if stats.sample_size >= self._config.min_samples_for_recommendation
                }

            path = Path(self._config.storage_path)
            path.parent.mkdir(parents=True, exist_ok=True)

            with open(path, "w") as f:
                json.dump(data, f, indent=2)

            self._dirty = False
            self._last_save_time = time.time()

    def _load_from_disk(self) -> None:
        """Load telemetry data from disk."""
        if not self._config.storage_path:
            return

        path = Path(self._config.storage_path)
        if not path.exists():
            return

        try:
            with open(path) as f:
                data = json.load(f)
            self.import_stats(data)
            self._dirty = False
        except (json.JSONDecodeError, OSError):
            pass  # Start fresh if file is corrupted

    def _analyze_fields(self, items: list[dict[str, Any]]) -> list[FieldDistribution]:
        """Analyze field distributions in items."""
        if not items:
            return []

        distributions: list[FieldDistribution] = []

        # Get all field names from first item
        sample = items[0] if isinstance(items[0], dict) else {}
        for field_name, _sample_value in sample.items():
            # Collect all values for this field
            values = [
                item.get(field_name)
                for item in items
                if isinstance(item, dict) and field_name in item
            ]

            if not values:
                continue

            dist = self._create_field_distribution(field_name, values)
            distributions.append(dist)

        return distributions

    def _create_field_distribution(
        self,
        field_name: str,
        values: list[Any],
    ) -> FieldDistribution:
        """Create a FieldDistribution from values."""
        field_hash = self._hash_field_name(field_name)

        # Determine type
        type_counts: dict[str, int] = {}
        for v in values:
            if isinstance(v, str):
                type_counts["string"] = type_counts.get("string", 0) + 1
            elif isinstance(v, bool):
                type_counts["boolean"] = type_counts.get("boolean", 0) + 1
            elif isinstance(v, (int, float)):
                type_counts["numeric"] = type_counts.get("numeric", 0) + 1
            elif isinstance(v, list):
                type_counts["array"] = type_counts.get("array", 0) + 1
            elif isinstance(v, dict):
                type_counts["object"] = type_counts.get("object", 0) + 1
            elif v is None:
                type_counts["null"] = type_counts.get("null", 0) + 1

        # Get dominant type
        if not type_counts:
            field_type = "null"
        elif len(type_counts) > 1:
            field_type = "mixed"
        else:
            field_type = list(type_counts.keys())[0]

        dist = FieldDistribution(
            field_name_hash=field_hash,
            field_type=field_type,  # type: ignore[arg-type]
        )

        # Type-specific analysis
        if field_type == "string":
            str_values = [v for v in values if isinstance(v, str)]
            if str_values:
                dist.avg_length = sum(len(s) for s in str_values) / len(str_values)
                unique_count = len(set(str_values))
                dist.unique_ratio = unique_count / len(str_values)
                dist.looks_like_id = dist.unique_ratio > 0.9 and dist.avg_length > 5

        elif field_type == "numeric":
            num_values = [v for v in values if isinstance(v, (int, float))]
            # Filter out infinity and NaN which can cause issues
            num_values = [
                v
                for v in num_values
                if not (
                    isinstance(v, float) and (v != v or v == float("inf") or v == float("-inf"))
                )
            ]
            if num_values:
                dist.has_negative = any(v < 0 for v in num_values)
                # Safe integer check (avoid OverflowError from int(inf))
                dist.is_integer = all(
                    isinstance(v, int) or (isinstance(v, float) and v.is_integer())
                    for v in num_values
                )

                if len(num_values) > 1:
                    mean = sum(num_values) / len(num_values)
                    variance = sum((v - mean) ** 2 for v in num_values) / len(num_values)
                    dist.has_variance = variance > 0

                    if variance == 0:
                        dist.variance_bucket = "zero"
                    elif variance < 10:
                        dist.variance_bucket = "low"
                    elif variance < 1000:
                        dist.variance_bucket = "medium"
                    else:
                        dist.variance_bucket = "high"

                    # Check for outliers
                    std = variance**0.5
                    if std > 0:
                        outliers = sum(1 for v in num_values if abs(v - mean) > 2 * std)
                        dist.has_outliers = outliers > 0

                    # Pattern detection
                    sorted_vals = sorted(num_values)
                    is_monotonic = (
                        sorted_vals == num_values or list(reversed(sorted_vals)) == num_values
                    )
                    if is_monotonic and dist.variance_bucket in ("medium", "high"):
                        dist.is_likely_score = True

        elif field_type == "array":
            arr_values = [v for v in values if isinstance(v, list)]
            if arr_values:
                dist.avg_array_length = sum(len(a) for a in arr_values) / len(arr_values)

        return dist

    def _update_tool_stats(self, signature: ToolSignature, event: CompressionEvent) -> None:
        """Update aggregated stats for a tool signature."""
        sig_hash = signature.structure_hash

        if sig_hash not in self._tool_stats:
            self._tool_stats[sig_hash] = AnonymizedToolStats(signature=signature)

        stats = self._tool_stats[sig_hash]

        # Update counts
        stats.total_compressions += 1
        stats.total_items_seen += event.original_item_count
        stats.total_items_kept += event.compressed_item_count
        stats.sample_size += 1

        # Update averages (rolling)
        n = stats.total_compressions
        stats.avg_compression_ratio = (
            stats.avg_compression_ratio * (n - 1) + event.compression_ratio
        ) / n
        stats.avg_token_reduction = (
            stats.avg_token_reduction * (n - 1) + event.token_reduction_ratio
        ) / n

        # Update strategy counts
        strategy = event.strategy
        stats.strategy_counts[strategy] = stats.strategy_counts.get(strategy, 0) + 1

        # Update confidence based on sample size
        stats.confidence = min(0.95, stats.sample_size / 100)

        # Update recommendations
        self._update_recommendations(sig_hash)

    def _update_recommendations(self, sig_hash: str) -> None:
        """Update recommendations based on current data."""
        if sig_hash not in self._tool_stats:
            return

        stats = self._tool_stats[sig_hash]

        # Not enough data yet
        if stats.sample_size < self._config.min_samples_for_recommendation:
            return

        # Check retrieval rate to determine if compression is too aggressive
        if stats.retrieval_stats:
            retrieval_rate = stats.retrieval_stats.retrieval_rate
            full_rate = stats.retrieval_stats.full_retrieval_rate

            # High retrieval rate = compression too aggressive
            if retrieval_rate > 0.5:
                if full_rate > 0.8:
                    # Almost all retrievals are full = skip compression
                    stats.skip_compression_recommended = True
                else:
                    # Increase min items
                    stats.recommended_min_items = 50
            elif retrieval_rate > 0.2:
                # Medium retrieval rate = slightly less aggressive
                stats.recommended_min_items = 30
            else:
                # Low retrieval rate = current settings work
                stats.recommended_min_items = 15

            # Track frequently queried fields
            if stats.retrieval_stats.query_field_frequency:
                top_fields = sorted(
                    stats.retrieval_stats.query_field_frequency.items(),
                    key=lambda x: x[1],
                    reverse=True,
                )[:5]
                stats.recommended_preserve_fields = [f for f, _ in top_fields]

    def _merge_tool_stats(
        self,
        existing: AnonymizedToolStats,
        imported: AnonymizedToolStats,
    ) -> None:
        """Merge imported stats into existing."""
        # Weighted average based on sample sizes
        total_samples = existing.sample_size + imported.sample_size
        if total_samples == 0:
            return

        w_existing = existing.sample_size / total_samples
        w_imported = imported.sample_size / total_samples

        existing.total_compressions += imported.total_compressions
        existing.total_items_seen += imported.total_items_seen
        existing.total_items_kept += imported.total_items_kept
        existing.avg_compression_ratio = (
            existing.avg_compression_ratio * w_existing
            + imported.avg_compression_ratio * w_imported
        )
        existing.avg_token_reduction = (
            existing.avg_token_reduction * w_existing + imported.avg_token_reduction * w_imported
        )
        existing.sample_size = total_samples

        # Merge strategy counts
        for strategy, count in imported.strategy_counts.items():
            existing.strategy_counts[strategy] = existing.strategy_counts.get(strategy, 0) + count

        # Update confidence
        existing.confidence = min(0.95, total_samples / 100)

    def _hash_field_name(self, field_name: str) -> str:
        """Hash a field name for anonymization."""
        if self._config.collect_field_names:
            return field_name
        return hashlib.sha256(field_name.encode()).hexdigest()[:8]

    def _calculate_avg_compression_ratio(self) -> float:
        """Calculate average compression ratio across all tools."""
        if not self._tool_stats:
            return 0.0
        ratios = [s.avg_compression_ratio for s in self._tool_stats.values()]
        return sum(ratios) / len(ratios)

    def _calculate_avg_token_reduction(self) -> float:
        """Calculate average token reduction across all tools."""
        if not self._tool_stats:
            return 0.0
        reductions = [s.avg_token_reduction for s in self._tool_stats.values()]
        return sum(reductions) / len(reductions)

    def _should_auto_save(self) -> bool:
        """Check if auto-save should run. Must be called with lock held."""
        if not self._config.auto_save_interval or not self._config.storage_path:
            return False

        if not self._dirty:
            return False

        elapsed = time.time() - self._last_save_time
        return elapsed >= self._config.auto_save_interval


# Global collector instance (lazy initialization)
_telemetry_collector: TelemetryCollector | None = None
_collector_lock = threading.Lock()


def get_telemetry_collector(
    config: TelemetryConfig | None = None,
) -> TelemetryCollector:
    """Get the global telemetry collector instance.

    Args:
        config: Configuration (only used on first call).

    Returns:
        Global TelemetryCollector instance.
    """
    global _telemetry_collector

    if _telemetry_collector is None:
        with _collector_lock:
            if _telemetry_collector is None:
                # Check environment for opt-out
                if os.environ.get("HEADROOM_TELEMETRY_DISABLED", "").lower() in ("1", "true"):
                    config = config or TelemetryConfig()
                    config.enabled = False

                _telemetry_collector = TelemetryCollector(config)

    return _telemetry_collector


def reset_telemetry_collector() -> None:
    """Reset the global telemetry collector. Mainly for testing."""
    global _telemetry_collector

    with _collector_lock:
        if _telemetry_collector is not None:
            _telemetry_collector.clear()
        _telemetry_collector = None
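
For orientation, here is a minimal usage sketch of the collector API shown in the file above. It is not from the package's documentation: the `TelemetryConfig` values, the sample items, the `"smart_crusher"` strategy label, and the storage path are all hypothetical, and the direct import from `headroom.telemetry.collector` assumes the module layout listed at the top of this diff (the package may also re-export these names elsewhere).

```python
# Illustrative sketch only; names and numbers marked below are made up.
from headroom.telemetry.collector import TelemetryCollector, TelemetryConfig

collector = TelemetryCollector(
    TelemetryConfig(
        storage_path="/tmp/headroom_telemetry.json",  # hypothetical path; None keeps data in memory
        auto_save_interval=0,  # disable periodic auto-save for this example
    )
)

# Pretend a tool returned 500 search hits and a compressor kept 25 of them.
items = [{"id": f"doc-{i}", "score": i * 0.01, "title": "..."} for i in range(500)]
collector.record_compression(
    items,
    original_count=500,
    compressed_count=25,
    original_tokens=40_000,
    compressed_tokens=2_500,
    strategy="smart_crusher",  # hypothetical strategy label
    processing_time_ms=12.5,
)

print(collector.get_stats()["total_tokens_saved"])  # 37500

# If the LLM later asks for the full output again, record that so recommendations adapt.
for sig_hash in collector.get_all_tool_stats():
    collector.record_retrieval(sig_hash, retrieval_type="full")
    print(collector.get_recommendations(sig_hash))  # None until min_samples_for_recommendation is reached

collector.save()  # writes the same JSON structure produced by export_stats()
```

The feedback loop visible in `_update_recommendations` is the design point: once a tool signature has at least `min_samples_for_recommendation` events, a high share of "full" retrievals pushes the collector toward `skip_compression_recommended`, while lower retrieval rates only raise or lower `recommended_min_items`, and the most frequently queried (hashed) fields become `recommended_preserve_fields`.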