headroom-ai 0.2.13 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
@@ -0,0 +1,764 @@
+ """TelemetryCollector for privacy-preserving statistics collection.
+
+ This module collects anonymized statistics about compression patterns
+ to enable cross-user learning and improve compression over time.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import json
+ import os
+ import threading
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ from .models import (
+     AnonymizedToolStats,
+     CompressionEvent,
+     FieldDistribution,
+     RetrievalStats,
+     ToolSignature,
+ )
+
+
+ @dataclass
+ class TelemetryConfig:
+     """Configuration for telemetry collection."""
+
+     # Enable/disable telemetry
+     enabled: bool = True
+
+     # Storage
+     storage_path: str | None = None  # Path to store telemetry data (None = in-memory only)
+     auto_save_interval: int = 300  # Auto-save every N seconds (0 = disabled)
+
+     # Privacy settings
+     anonymize_tool_names: bool = True  # Hash tool names
+     collect_field_names: bool = False  # If False, only collect field hashes
+     collect_timing: bool = True  # Collect processing time
+
+     # Aggregation settings
+     max_events_in_memory: int = 10000  # Max events to keep in memory
+     min_samples_for_recommendation: int = 10  # Min samples before making recommendations
+
+     # Export settings
+     include_field_distributions: bool = True  # Include detailed field stats in export
+     include_recommendations: bool = True  # Include learned recommendations
+
+
+ class TelemetryCollector:
+     """Collects and aggregates compression telemetry.
+
+     Thread-safe collector that maintains anonymized statistics about
+     compression patterns. Can be used to:
+     - Understand what tool outputs look like (structurally)
+     - Track which compression strategies work best
+     - Learn optimal settings per tool type
+     - Export data for cross-user aggregation
+
+     Privacy guarantees:
+     - No actual data values are stored
+     - Tool names are hashed by default
+     - Field names can be hashed
+     - No user identifiers
+     - No query content
+     """
+
+     def __init__(self, config: TelemetryConfig | None = None):
+         """Initialize the telemetry collector.
+
+         Args:
+             config: Configuration options. Uses defaults if not provided.
+         """
+         self._config = config or TelemetryConfig()
+         self._lock = threading.Lock()
+
+         # Event storage
+         self._events: list[CompressionEvent] = []
+
+         # Aggregated stats per tool signature
+         self._tool_stats: dict[str, AnonymizedToolStats] = {}
+
+         # Retrieval tracking
+         self._retrieval_stats: dict[str, RetrievalStats] = {}
+
+         # Global counters
+         self._total_compressions: int = 0
+         self._total_retrievals: int = 0
+         self._total_tokens_saved: int = 0
+
+         # Auto-save tracking
+         self._last_save_time: float = time.time()
+         self._dirty: bool = False
+
+         # Load existing data if storage path exists
+         if self._config.storage_path:
+             self._load_from_disk()
+
+     def record_compression(
+         self,
+         items: list[dict[str, Any]],
+         original_count: int,
+         compressed_count: int,
+         original_tokens: int,
+         compressed_tokens: int,
+         strategy: str,
+         *,
+         tool_name: str | None = None,
+         strategy_reason: str | None = None,
+         crushability_score: float | None = None,
+         crushability_reason: str | None = None,
+         kept_first_n: int = 0,
+         kept_last_n: int = 0,
+         kept_errors: int = 0,
+         kept_anomalies: int = 0,
+         kept_by_relevance: int = 0,
+         kept_by_score: int = 0,
+         processing_time_ms: float = 0.0,
+     ) -> None:
+         """Record a compression event.
+
+         Args:
+             items: Sample items from the original array (for structure analysis).
+             original_count: Original number of items.
+             compressed_count: Number of items after compression.
+             original_tokens: Original token count.
+             compressed_tokens: Compressed token count.
+             strategy: Compression strategy used.
+             tool_name: Optional tool name (will be hashed if configured).
+             strategy_reason: Why this strategy was chosen.
+             crushability_score: Crushability analysis score.
+             crushability_reason: Crushability analysis reason.
+             kept_first_n: Items kept from start.
+             kept_last_n: Items kept from end.
+             kept_errors: Error items kept.
+             kept_anomalies: Anomalous items kept.
+             kept_by_relevance: Items kept by relevance score.
+             kept_by_score: Items kept by score field.
+             processing_time_ms: Processing time in milliseconds.
+         """
+         if not self._config.enabled:
+             return
+
+         # Create tool signature from items
+         signature = ToolSignature.from_items(items[:10])  # Sample first 10
+
+         # Analyze field distributions
+         field_distributions: list[FieldDistribution] = []
+         if self._config.include_field_distributions and items:
+             field_distributions = self._analyze_fields(items[:100])  # Sample 100
+
+         # Calculate ratios
+         compression_ratio = compressed_count / original_count if original_count > 0 else 0.0
+         token_reduction = 1 - (compressed_tokens / original_tokens) if original_tokens > 0 else 0.0
+
+         # Create event
+         event = CompressionEvent(
+             tool_signature=signature,
+             original_item_count=original_count,
+             compressed_item_count=compressed_count,
+             compression_ratio=compression_ratio,
+             original_tokens=original_tokens,
+             compressed_tokens=compressed_tokens,
+             token_reduction_ratio=token_reduction,
+             strategy=strategy,
+             strategy_reason=strategy_reason,
+             crushability_score=crushability_score,
+             crushability_reason=crushability_reason,
+             field_distributions=field_distributions,
+             kept_first_n=kept_first_n,
+             kept_last_n=kept_last_n,
+             kept_errors=kept_errors,
+             kept_anomalies=kept_anomalies,
+             kept_by_relevance=kept_by_relevance,
+             kept_by_score=kept_by_score,
+             timestamp=time.time(),
+             processing_time_ms=processing_time_ms,
+         )
+
+         should_save = False
+         with self._lock:
+             # Store event
+             self._events.append(event)
+             if len(self._events) > self._config.max_events_in_memory:
+                 self._events = self._events[-self._config.max_events_in_memory :]
+
+             # Update aggregated stats
+             self._update_tool_stats(signature, event)
+
+             # Update global counters
+             self._total_compressions += 1
+             self._total_tokens_saved += original_tokens - compressed_tokens
+             self._dirty = True
+
+             # Check if auto-save needed (don't actually save while holding lock)
+             should_save = self._should_auto_save()
+
+         # Auto-save outside lock to avoid blocking other operations
+         if should_save:
+             self.save()
+
+     def record_retrieval(
+         self,
+         tool_signature_hash: str,
+         retrieval_type: str,  # "full" or "search"
+         query_fields: list[str] | None = None,
+     ) -> None:
+         """Record a retrieval event.
+
+         This is called when an LLM retrieves compressed content, indicating
+         the compression may have been too aggressive.
+
+         Args:
+             tool_signature_hash: Hash of the tool signature.
+             retrieval_type: "full" (retrieved everything) or "search" (filtered).
+             query_fields: Field names mentioned in search query (will be hashed).
+         """
+         if not self._config.enabled:
+             return
+
+         with self._lock:
+             # Get or create retrieval stats
+             if tool_signature_hash not in self._retrieval_stats:
+                 self._retrieval_stats[tool_signature_hash] = RetrievalStats(
+                     tool_signature_hash=tool_signature_hash
+                 )
+
+             stats = self._retrieval_stats[tool_signature_hash]
+             stats.total_retrievals += 1
+
+             if retrieval_type == "full":
+                 stats.full_retrievals += 1
+             else:
+                 stats.search_retrievals += 1
+
+             # Track queried fields (anonymized)
+             if query_fields:
+                 for field_name in query_fields:
+                     field_hash = self._hash_field_name(field_name)
+                     stats.query_field_frequency[field_hash] = (
+                         stats.query_field_frequency.get(field_hash, 0) + 1
+                     )
+
+             # Update global counter
+             self._total_retrievals += 1
+             self._dirty = True
+
+             # Update tool stats with retrieval info
+             if tool_signature_hash in self._tool_stats:
+                 self._tool_stats[tool_signature_hash].retrieval_stats = stats
+                 self._update_recommendations(tool_signature_hash)
+
+     def get_stats(self) -> dict[str, Any]:
+         """Get overall telemetry statistics.
+
+         Returns:
+             Dictionary with aggregated statistics.
+         """
+         with self._lock:
+             return {
+                 "enabled": self._config.enabled,
+                 "total_compressions": self._total_compressions,
+                 "total_retrievals": self._total_retrievals,
+                 "total_tokens_saved": self._total_tokens_saved,
+                 "global_retrieval_rate": (
+                     self._total_retrievals / self._total_compressions
+                     if self._total_compressions > 0
+                     else 0.0
+                 ),
+                 "tool_signatures_tracked": len(self._tool_stats),
+                 "events_in_memory": len(self._events),
+                 "avg_compression_ratio": self._calculate_avg_compression_ratio(),
+                 "avg_token_reduction": self._calculate_avg_token_reduction(),
+             }
+
+     def get_tool_stats(self, signature_hash: str) -> AnonymizedToolStats | None:
+         """Get statistics for a specific tool signature.
+
+         Args:
+             signature_hash: The tool signature hash.
+
+         Returns:
+             AnonymizedToolStats if found, None otherwise.
+         """
+         with self._lock:
+             return self._tool_stats.get(signature_hash)
+
+     def get_all_tool_stats(self) -> dict[str, AnonymizedToolStats]:
+         """Get statistics for all tracked tool signatures.
+
+         Returns:
+             Dictionary mapping signature hash to stats.
+         """
+         with self._lock:
+             return dict(self._tool_stats)
+
+     def get_recommendations(self, signature_hash: str) -> dict[str, Any] | None:
+         """Get learned recommendations for a tool signature.
+
+         Args:
+             signature_hash: The tool signature hash.
+
+         Returns:
+             Recommendations dictionary if available, None otherwise.
+         """
+         with self._lock:
+             stats = self._tool_stats.get(signature_hash)
+             if not stats or stats.sample_size < self._config.min_samples_for_recommendation:
+                 return None
+
+             return {
+                 "signature_hash": signature_hash,
+                 "recommended_min_items": stats.recommended_min_items,
+                 "recommended_preserve_fields": stats.recommended_preserve_fields,
+                 "skip_compression_recommended": stats.skip_compression_recommended,
+                 "confidence": stats.confidence,
+                 "based_on_samples": stats.sample_size,
+                 "retrieval_rate": (
+                     stats.retrieval_stats.retrieval_rate if stats.retrieval_stats else None
+                 ),
+             }
+
+     def export_stats(self) -> dict[str, Any]:
+         """Export all telemetry data for aggregation.
+
+         This is the data that can be sent to a central server for
+         cross-user learning (with user consent).
+
+         Returns:
+             Complete telemetry export.
+         """
+         with self._lock:
+             export = {
+                 "version": "1.0",
+                 "export_timestamp": time.time(),
+                 "summary": {
+                     "total_compressions": self._total_compressions,
+                     "total_retrievals": self._total_retrievals,
+                     "total_tokens_saved": self._total_tokens_saved,
+                     "tool_signatures_tracked": len(self._tool_stats),
+                 },
+                 "tool_stats": {
+                     sig_hash: stats.to_dict() for sig_hash, stats in self._tool_stats.items()
+                 },
+             }
+
+             if self._config.include_recommendations:
+                 export["recommendations"] = {
+                     sig_hash: {
+                         "recommended_min_items": stats.recommended_min_items,
+                         "skip_compression_recommended": stats.skip_compression_recommended,
+                         "confidence": stats.confidence,
+                     }
+                     for sig_hash, stats in self._tool_stats.items()
+                     if stats.sample_size >= self._config.min_samples_for_recommendation
+                 }
+
+             return export
+
+     def import_stats(self, data: dict[str, Any]) -> None:
+         """Import telemetry data from another source.
+
+         This allows merging stats from multiple users for cross-user learning.
+
+         Args:
+             data: Exported telemetry data.
+         """
+         if not self._config.enabled:
+             return
+
+         with self._lock:
+             # Import summary counters
+             summary = data.get("summary", {})
+             self._total_compressions += summary.get("total_compressions", 0)
+             self._total_retrievals += summary.get("total_retrievals", 0)
+             self._total_tokens_saved += summary.get("total_tokens_saved", 0)
+
+             # Import tool stats
+             tool_stats_data = data.get("tool_stats", {})
+             for sig_hash, stats_dict in tool_stats_data.items():
+                 if sig_hash in self._tool_stats:
+                     # Merge with existing
+                     existing = self._tool_stats[sig_hash]
+                     imported = AnonymizedToolStats.from_dict(stats_dict)
+                     self._merge_tool_stats(existing, imported)
+                 else:
+                     # Add new
+                     self._tool_stats[sig_hash] = AnonymizedToolStats.from_dict(stats_dict)
+
+             self._dirty = True
+
+     def clear(self) -> None:
+         """Clear all telemetry data. Mainly for testing."""
+         with self._lock:
+             self._events.clear()
+             self._tool_stats.clear()
+             self._retrieval_stats.clear()
+             self._total_compressions = 0
+             self._total_retrievals = 0
+             self._total_tokens_saved = 0
+             self._dirty = False
+
+     def save(self) -> None:
+         """Save telemetry data to disk."""
+         if not self._config.storage_path:
+             return
+
+         with self._lock:
+             # Build export data inline to avoid deadlock (export_stats also acquires lock)
+             data = {
+                 "version": "1.0",
+                 "export_timestamp": time.time(),
+                 "summary": {
+                     "total_compressions": self._total_compressions,
+                     "total_retrievals": self._total_retrievals,
+                     "total_tokens_saved": self._total_tokens_saved,
+                     "tool_signatures_tracked": len(self._tool_stats),
+                 },
+                 "tool_stats": {
+                     sig_hash: stats.to_dict() for sig_hash, stats in self._tool_stats.items()
+                 },
+             }
+
+             if self._config.include_recommendations:
+                 data["recommendations"] = {
+                     sig_hash: {
+                         "recommended_min_items": stats.recommended_min_items,
+                         "skip_compression_recommended": stats.skip_compression_recommended,
+                         "confidence": stats.confidence,
+                     }
+                     for sig_hash, stats in self._tool_stats.items()
+                     if stats.sample_size >= self._config.min_samples_for_recommendation
+                 }
+
+             path = Path(self._config.storage_path)
+             path.parent.mkdir(parents=True, exist_ok=True)
+
+             with open(path, "w") as f:
+                 json.dump(data, f, indent=2)
+
+             self._dirty = False
+             self._last_save_time = time.time()
+
+     def _load_from_disk(self) -> None:
+         """Load telemetry data from disk."""
+         if not self._config.storage_path:
+             return
+
+         path = Path(self._config.storage_path)
+         if not path.exists():
+             return
+
+         try:
+             with open(path) as f:
+                 data = json.load(f)
+             self.import_stats(data)
+             self._dirty = False
+         except (json.JSONDecodeError, OSError):
+             pass  # Start fresh if file is corrupted
+
+     def _analyze_fields(self, items: list[dict[str, Any]]) -> list[FieldDistribution]:
+         """Analyze field distributions in items."""
+         if not items:
+             return []
+
+         distributions: list[FieldDistribution] = []
+
+         # Get all field names from first item
+         sample = items[0] if isinstance(items[0], dict) else {}
+         for field_name, _sample_value in sample.items():
+             # Collect all values for this field
+             values = [
+                 item.get(field_name)
+                 for item in items
+                 if isinstance(item, dict) and field_name in item
+             ]
+
+             if not values:
+                 continue
+
+             dist = self._create_field_distribution(field_name, values)
+             distributions.append(dist)
+
+         return distributions
+
+     def _create_field_distribution(
+         self,
+         field_name: str,
+         values: list[Any],
+     ) -> FieldDistribution:
+         """Create a FieldDistribution from values."""
+         field_hash = self._hash_field_name(field_name)
+
+         # Determine type
+         type_counts: dict[str, int] = {}
+         for v in values:
+             if isinstance(v, str):
+                 type_counts["string"] = type_counts.get("string", 0) + 1
+             elif isinstance(v, bool):
+                 type_counts["boolean"] = type_counts.get("boolean", 0) + 1
+             elif isinstance(v, (int, float)):
+                 type_counts["numeric"] = type_counts.get("numeric", 0) + 1
+             elif isinstance(v, list):
+                 type_counts["array"] = type_counts.get("array", 0) + 1
+             elif isinstance(v, dict):
+                 type_counts["object"] = type_counts.get("object", 0) + 1
+             elif v is None:
+                 type_counts["null"] = type_counts.get("null", 0) + 1
+
+         # Get dominant type
+         if not type_counts:
+             field_type = "null"
+         elif len(type_counts) > 1:
+             field_type = "mixed"
+         else:
+             field_type = list(type_counts.keys())[0]
+
+         dist = FieldDistribution(
+             field_name_hash=field_hash,
+             field_type=field_type,  # type: ignore[arg-type]
+         )
+
+         # Type-specific analysis
+         if field_type == "string":
+             str_values = [v for v in values if isinstance(v, str)]
+             if str_values:
+                 dist.avg_length = sum(len(s) for s in str_values) / len(str_values)
+                 unique_count = len(set(str_values))
+                 dist.unique_ratio = unique_count / len(str_values)
+                 dist.looks_like_id = dist.unique_ratio > 0.9 and dist.avg_length > 5
+
+         elif field_type == "numeric":
+             num_values = [v for v in values if isinstance(v, (int, float))]
+             # Filter out infinity and NaN which can cause issues
+             num_values = [
+                 v
+                 for v in num_values
+                 if not (
+                     isinstance(v, float) and (v != v or v == float("inf") or v == float("-inf"))
+                 )
+             ]
+             if num_values:
+                 dist.has_negative = any(v < 0 for v in num_values)
+                 # Safe integer check (avoid OverflowError from int(inf))
+                 dist.is_integer = all(
+                     isinstance(v, int) or (isinstance(v, float) and v.is_integer())
+                     for v in num_values
+                 )
+
+                 if len(num_values) > 1:
+                     mean = sum(num_values) / len(num_values)
+                     variance = sum((v - mean) ** 2 for v in num_values) / len(num_values)
+                     dist.has_variance = variance > 0
+
+                     if variance == 0:
+                         dist.variance_bucket = "zero"
+                     elif variance < 10:
+                         dist.variance_bucket = "low"
+                     elif variance < 1000:
+                         dist.variance_bucket = "medium"
+                     else:
+                         dist.variance_bucket = "high"
+
+                     # Check for outliers
+                     std = variance**0.5
+                     if std > 0:
+                         outliers = sum(1 for v in num_values if abs(v - mean) > 2 * std)
+                         dist.has_outliers = outliers > 0
+
+                     # Pattern detection
+                     sorted_vals = sorted(num_values)
+                     is_monotonic = (
+                         sorted_vals == num_values or list(reversed(sorted_vals)) == num_values
+                     )
+                     if is_monotonic and dist.variance_bucket in ("medium", "high"):
+                         dist.is_likely_score = True
+
+         elif field_type == "array":
+             arr_values = [v for v in values if isinstance(v, list)]
+             if arr_values:
+                 dist.avg_array_length = sum(len(a) for a in arr_values) / len(arr_values)
+
+         return dist
+
+     def _update_tool_stats(self, signature: ToolSignature, event: CompressionEvent) -> None:
+         """Update aggregated stats for a tool signature."""
+         sig_hash = signature.structure_hash
+
+         if sig_hash not in self._tool_stats:
+             self._tool_stats[sig_hash] = AnonymizedToolStats(signature=signature)
+
+         stats = self._tool_stats[sig_hash]
+
+         # Update counts
+         stats.total_compressions += 1
+         stats.total_items_seen += event.original_item_count
+         stats.total_items_kept += event.compressed_item_count
+         stats.sample_size += 1
+
+         # Update averages (rolling)
+         n = stats.total_compressions
+         stats.avg_compression_ratio = (
+             stats.avg_compression_ratio * (n - 1) + event.compression_ratio
+         ) / n
+         stats.avg_token_reduction = (
+             stats.avg_token_reduction * (n - 1) + event.token_reduction_ratio
+         ) / n
+
+         # Update strategy counts
+         strategy = event.strategy
+         stats.strategy_counts[strategy] = stats.strategy_counts.get(strategy, 0) + 1
+
+         # Update confidence based on sample size
+         stats.confidence = min(0.95, stats.sample_size / 100)
+
+         # Update recommendations
+         self._update_recommendations(sig_hash)
+
+     def _update_recommendations(self, sig_hash: str) -> None:
+         """Update recommendations based on current data."""
+         if sig_hash not in self._tool_stats:
+             return
+
+         stats = self._tool_stats[sig_hash]
+
+         # Not enough data yet
+         if stats.sample_size < self._config.min_samples_for_recommendation:
+             return
+
+         # Check retrieval rate to determine if compression is too aggressive
+         if stats.retrieval_stats:
+             retrieval_rate = stats.retrieval_stats.retrieval_rate
+             full_rate = stats.retrieval_stats.full_retrieval_rate
+
+             # High retrieval rate = compression too aggressive
+             if retrieval_rate > 0.5:
+                 if full_rate > 0.8:
+                     # Almost all retrievals are full = skip compression
+                     stats.skip_compression_recommended = True
+                 else:
+                     # Increase min items
+                     stats.recommended_min_items = 50
+             elif retrieval_rate > 0.2:
+                 # Medium retrieval rate = slightly less aggressive
+                 stats.recommended_min_items = 30
+             else:
+                 # Low retrieval rate = current settings work
+                 stats.recommended_min_items = 15
+
+             # Track frequently queried fields
+             if stats.retrieval_stats.query_field_frequency:
+                 top_fields = sorted(
+                     stats.retrieval_stats.query_field_frequency.items(),
+                     key=lambda x: x[1],
+                     reverse=True,
+                 )[:5]
+                 stats.recommended_preserve_fields = [f for f, _ in top_fields]
+
+     def _merge_tool_stats(
+         self,
+         existing: AnonymizedToolStats,
+         imported: AnonymizedToolStats,
+     ) -> None:
+         """Merge imported stats into existing."""
+         # Weighted average based on sample sizes
+         total_samples = existing.sample_size + imported.sample_size
+         if total_samples == 0:
+             return
+
+         w_existing = existing.sample_size / total_samples
+         w_imported = imported.sample_size / total_samples
+
+         existing.total_compressions += imported.total_compressions
+         existing.total_items_seen += imported.total_items_seen
+         existing.total_items_kept += imported.total_items_kept
+         existing.avg_compression_ratio = (
+             existing.avg_compression_ratio * w_existing
+             + imported.avg_compression_ratio * w_imported
+         )
+         existing.avg_token_reduction = (
+             existing.avg_token_reduction * w_existing + imported.avg_token_reduction * w_imported
+         )
+         existing.sample_size = total_samples
+
+         # Merge strategy counts
+         for strategy, count in imported.strategy_counts.items():
+             existing.strategy_counts[strategy] = existing.strategy_counts.get(strategy, 0) + count
+
+         # Update confidence
+         existing.confidence = min(0.95, total_samples / 100)
+
+     def _hash_field_name(self, field_name: str) -> str:
+         """Hash a field name for anonymization."""
+         if self._config.collect_field_names:
+             return field_name
+         return hashlib.sha256(field_name.encode()).hexdigest()[:8]
+
+     def _calculate_avg_compression_ratio(self) -> float:
+         """Calculate average compression ratio across all tools."""
+         if not self._tool_stats:
+             return 0.0
+         ratios = [s.avg_compression_ratio for s in self._tool_stats.values()]
+         return sum(ratios) / len(ratios)
+
+     def _calculate_avg_token_reduction(self) -> float:
+         """Calculate average token reduction across all tools."""
+         if not self._tool_stats:
+             return 0.0
+         reductions = [s.avg_token_reduction for s in self._tool_stats.values()]
+         return sum(reductions) / len(reductions)
+
+     def _should_auto_save(self) -> bool:
+         """Check if auto-save should run. Must be called with lock held."""
+         if not self._config.auto_save_interval or not self._config.storage_path:
+             return False
+
+         if not self._dirty:
+             return False
+
+         elapsed = time.time() - self._last_save_time
+         return elapsed >= self._config.auto_save_interval
+
+
+ # Global collector instance (lazy initialization)
+ _telemetry_collector: TelemetryCollector | None = None
+ _collector_lock = threading.Lock()
+
+
+ def get_telemetry_collector(
+     config: TelemetryConfig | None = None,
+ ) -> TelemetryCollector:
+     """Get the global telemetry collector instance.
+
+     Args:
+         config: Configuration (only used on first call).
+
+     Returns:
+         Global TelemetryCollector instance.
+     """
+     global _telemetry_collector
+
+     if _telemetry_collector is None:
+         with _collector_lock:
+             if _telemetry_collector is None:
+                 # Check environment for opt-out
+                 if os.environ.get("HEADROOM_TELEMETRY_DISABLED", "").lower() in ("1", "true"):
+                     config = config or TelemetryConfig()
+                     config.enabled = False
+
+                 _telemetry_collector = TelemetryCollector(config)
+
+     return _telemetry_collector
+
+
+ def reset_telemetry_collector() -> None:
+     """Reset the global telemetry collector. Mainly for testing."""
+     global _telemetry_collector
+
+     with _collector_lock:
+         if _telemetry_collector is not None:
+             _telemetry_collector.clear()
+         _telemetry_collector = None
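
For orientation, the snippet below is a minimal usage sketch of the TelemetryCollector API introduced in headroom/telemetry/collector.py above. Only the import path, class names, and method signatures come from the diff; the sample items, token counts, the storage path, and the "smart_sample" strategy label are invented for illustration.

```python
# Minimal sketch: exercises the TelemetryCollector API shown in the diff above.
# The items, counts, path, and the "smart_sample" label are hypothetical.
from headroom.telemetry.collector import TelemetryCollector, TelemetryConfig

config = TelemetryConfig(
    storage_path="/tmp/headroom_telemetry.json",  # hypothetical path; None keeps data in memory
    auto_save_interval=0,                         # disable timed auto-save for this example
)
collector = TelemetryCollector(config)

# Pretend a tool returned 200 structured hits and a compressor kept 15 of them.
items = [{"id": f"doc-{i}", "score": i * 0.01, "text": "..."} for i in range(200)]
collector.record_compression(
    items=items,
    original_count=200,
    compressed_count=15,
    original_tokens=12_000,
    compressed_tokens=1_100,
    strategy="smart_sample",   # hypothetical strategy label
    processing_time_ms=4.2,
)

stats = collector.get_stats()
print(stats["total_compressions"], stats["total_tokens_saved"])  # 1 10900

collector.save()                    # explicit flush to storage_path
payload = collector.export_stats()  # anonymized payload suitable for aggregation
```

Note that collection can also be switched off globally: get_telemetry_collector() checks the HEADROOM_TELEMETRY_DISABLED environment variable ("1" or "true") on first use and creates the shared collector with telemetry disabled.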